Hi experts, I am trying to use the "slicing" functionality on string columns of a DataFrame as part of a Spark program (PySpark), and I get the error shown below:
**** Code ****

    import pandas as pd
    from pyspark.sql import SQLContext

    hc = SQLContext(sc)
    A = pd.DataFrame({'Firstname': ['James', 'Ali', 'Daniel'], 'Lastname': ['Jones', 'Bajwa', 'Day']})
    a = hc.createDataFrame(A)
    print A
    b = a.select(a.Firstname[:2])
    print b.toPandas()
    c = a.select(a.Lastname[2:])
    print c.toPandas()

Output:

      Firstname Lastname
    0     James    Jones
    1       Ali    Bajwa
    2    Daniel      Day
      SUBSTR(Firstname, 0, 2)
    0                      Ja
    1                      Al
    2                      Da
    ---------------------------------------------------------------------------
    Py4JError                                 Traceback (most recent call last)
    <ipython-input-17-6ee5d7d069ce> in <module>()
         10 b = a.select(a.Firstname[:2])
         11 print b.toPandas()
    ---> 12 c = a.select(a.Lastname[2:])
         13 print c.toPandas()

    /home/jupyter/spark-1.3.1/python/pyspark/sql/dataframe.pyc in substr(self, startPos, length)
       1089                 raise TypeError("Can not mix the type")
       1090         if isinstance(startPos, (int, long)):
    -> 1091             jc = self._jc.substr(startPos, length)
       1092         elif isinstance(startPos, Column):
       1093             jc = self._jc.substr(startPos._jc, length._jc)

    /home/jupyter/spark-1.3.1/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
        536         answer = self.gateway_client.send_command(command)
        537         return_value = get_return_value(answer, self.gateway_client,
    --> 538                 self.target_id, self.name)
        539
        540         for temp_arg in temp_args:

    /home/jupyter/spark-1.3.1/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
        302                 raise Py4JError(
        303                     'An error occurred while calling {0}{1}{2}. Trace:\n{3}\n'.
    --> 304                     format(target_id, '.', name, value))
        305             else:
        306                 raise Py4JError(

    Py4JError: An error occurred while calling o1887.substr.
    Trace:
    py4j.Py4JException: Method substr([class java.lang.Integer, class java.lang.Long]) does not exist
        at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:333)
        at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:342)
        at py4j.Gateway.invoke(Gateway.java:252)
        at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
        at py4j.commands.CallCommand.execute(CallCommand.java:79)
        at py4j.GatewayConnection.run(GatewayConnection.java:207)
        at java.lang.Thread.run(Thread.java:745)

It looks like X[:2] works but X[2:] fails with the error above. Note that the trace shows substr being called with an Integer start and a Long length, for which no matching method exists.

Has anyone else run into this issue? Clearly I can use substr() directly to work around this, but if this is a confirmed bug we should open a JIRA.

Thanks,
Ali