I have the following Python script:

import time

from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

from urllib.parse import urlsplit, unquote


def extractPath(host, url):
    """Return the path component of *url* if *host* occurs in it, else '-'.

    Note: the membership test is a plain substring check on the whole URL,
    not a comparison against the parsed netloc.
    """
    return urlsplit(url).path if host in url else '-'

# Time how long it takes to wrap extractPath as a Spark SQL UDF.
udfStart = time.time()
getPathUdf = udf(extractPath, StringType())
udfEnd = time.time()

print("Python udf creation time: {}".format(udfEnd - udfStart))

and the Scala script:

import java.net.URLDecoder
import java.nio.charset.StandardCharsets
import java.net.URL

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.udf

object UdfTimes {

  /** Measures how long wrapping a Scala function as a Spark UDF takes.
   *
   *  Uses an explicit `main` instead of `extends App` to avoid the
   *  delayed-initialization pitfalls of the `App` trait.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").getOrCreate()

    spark.sparkContext.setLogLevel("ERROR")

    // Returns the URL's path when `host` occurs anywhere in `url`, "-" otherwise.
    // Mirrors the Python extractPath (substring check, not a netloc comparison).
    val extractPath: (String, String) => String = (host, url) => {
      if (url.contains(host))
        new URL(url).getPath
      else
        "-"
    }

    // Counterpart of Python's urllib `unquote`; unused below, kept only for
    // parity with the Python script being compared.
    val unquote: String => String = str => URLDecoder.decode(str, StandardCharsets.UTF_8.name())

    val startTimeUdf = System.nanoTime()
    val getPathUdf = udf(extractPath) // the operation under measurement
    val endTimeUdf = System.nanoTime()

    // nanoTime delta converted to seconds (1e9 ns per second).
    println("Scala udf registering time: " + (endTimeUdf - startTimeUdf) / 1e9)

    // Release the local SparkContext so the JVM can exit cleanly.
    spark.stop()
  }
}

I have written both scripts to do the same thing. The UDF creation is instant in Python (run from the command line):

Python udf creation time: 2.0503997802734375e-05

but in Scala, it takes almost a second (sbt command line):

udf registering time: 0.768687091

What is the reason for this big difference?

0 Answers