While initializing my Spark session, I'm getting an error message related to Hive.
My Spark session config is as follows:
# Assuming available resources: 250 vcores, 1TB memory, 2 people sharing the queue
# Optimized for 250M rows, 1,000 columns, 50% sparsity
from os import environ
from os.path import abspath
from pyspark.sql import SparkSession
import re
import getpass
environ["PYSPARK_PYTHON"]="/opt/cloudera/parcels/Anaconda/envs/python37/bin/python"
environ["PYSPARK_DRIVER_PYTHON"]="/opt/cloudera/parcels/Anaconda/envs/python37/bin/python"
environ["SPARK_HOME"]="/opt/cloudera/parcels/CDH/lib/spark"
environ["YARN_CONF_DIR"]="/etc/hive/conf"
environ["ARROW_LIBHDFS_DIR"]="/opt/cloudera/parcels/CDH/lib/hadoop/lib/native/"
environ["HTTPS_PROXY"]=https://proxy.p1at.s-group.cc:8080
environ["HTTP_PROXY"]=http://proxy.p1at.s-group.cc:8080
environ["HADOOP_CONF_DIR"]="/etc/hive/conf"
spark = SparkSession \
    .builder \
    .config("spark.master", "yarn") \
    .appName("CRM: AT-PRK-EY-LFTO") \
    .config("spark.yarn.queue", "root.project.mars") \
    .config("spark.sql.warehouse.dir", abspath('spark-warehouse')) \
    .config("spark.executor.memory", "16g") \
    .config("spark.driver.memory", "20g") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.minExecutors", "1") \
    .config("spark.dynamicAllocation.maxExecutors", "25") \
    .config("spark.executor.cores", "5") \
    .config("spark.memory.offHeap.enabled", "true") \
    .config("spark.memory.offHeap.size", "20g") \
    .config("spark.default.parallelism", "750") \
    .config("spark.sql.shuffle.partitions", "750") \
    .config("spark.kryoserializer.buffer.max", "512m") \
    .config("spark.network.timeout", "300s") \
    .config("spark.executor.heartbeatInterval", "60s") \
    .config("spark.speculation", "true") \
    .config("spark.speculation.interval", "10000ms") \
    .config("spark.speculation.multiplier", "1.5") \
    .config("spark.speculation.quantile", "0.75") \
    .config("spark.sql.files.maxPartitionBytes", "512m") \
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:G1HeapRegionSize=16m -XX:MaxGCPauseMillis=500 -XX:InitiatingHeapOccupancyPercent=35") \
    .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC -XX:G1HeapRegionSize=16m -XX:MaxGCPauseMillis=500 -XX:InitiatingHeapOccupancyPercent=35") \
    .enableHiveSupport() \
    .getOrCreate()
print("Spark Session OK - for all settings run spark.sparkContext.getConf().getAll()")
print(f'- Spark UI link: http://bdagb1node03.eb.lan.at:18088/history/{spark.sparkContext.applicationId}')
print(f'- application_id: {spark.sparkContext.applicationId}')
# .config('spark.submit.deployMode', deployMode)\
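To isolate the failure, I can also try a stripped-down builder that keeps the same environment variables but drops all the tuning settings (a minimal sketch, with a probe name I made up): if the bare YARN session comes up cleanly, re-adding .enableHiveSupport() alone should tell me whether the Hive wiring, rather than the config above, is the trigger.

# Minimal probe (my naming, not the production job): bare YARN session,
# no Hive support, no tuning. If this works, try adding .enableHiveSupport().
probe = SparkSession.builder \
    .config("spark.master", "yarn") \
    .appName("session-probe") \
    .getOrCreate()
print("bare session OK:", probe.version)
probe.stop()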
The error message is:
Py4JJavaError Traceback (most recent call last)
/opt/cloudera/parcels/MINIFORGE-2022.2/envs/crm_at_prk/lib/python3.7/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
62 try:
63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
/opt/cloudera/parcels/MINIFORGE-2022.2/envs/crm_at_prk/lib/python3.7/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
Py4JJavaError: An error occurred while calling o140.sessionState.
java.lang.IllegalArgumentException: Error while instantiating 'org.apache.spark.sql.hive.HiveSessionStateBuilder':
During handling of the above exception, another exception occurred:
IllegalArgumentException Traceback (most recent call last)
/tmp/ipykernel_72512/653573609.py in <module>
42 .config("spark.sql.files.maxPartitionBytes", "512m") \
43 .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:G1HeapRegionSize=16m -XX:MaxGCPauseMillis=500 -XX:InitiatingHeapOccupancyPercent=35") \
44 .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC -XX:G1HeapRegionSize=16m -XX:MaxGCPauseMillis=500 -XX:InitiatingHeapOccupancyPercent=35") \
45 .enableHiveSupport() \
46 .getOrCreate()
/opt/cloudera/parcels/MINIFORGE-2022.2/envs/crm_at_prk/lib/python3.7/site-packages/pyspark/sql/session.py in getOrCreate(self)
181 session = SparkSession(sc)
182 for key, value in self._options.items():
183 session._jsparkSession.sessionState().conf().setConfString(key, value)
184 for key, value in self._options.items():
185 session.sparkContext._conf.set(key, value)
/opt/cloudera/parcels/MINIFORGE-2022.2/envs/crm_at_prk/lib/python3.7/site-packages/py4j/java_gateway.py in __call__(self, *args)
1321 answer = self.gateway_client.send_command(command)
1322 return_value = get_return_value(
1323 answer, self.gateway_client, self.target_id, self.name)
1324
1325 for temp_arg in temp_args:
/opt/cloudera/parcels/MINIFORGE-2022.2/envs/crm_at_prk/lib/python3.7/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
77 raise QueryExecutionException(s.split(': ', 1)[1], stackTrace)
78 if s.startswith('java.lang.IllegalArgumentException: '):
79 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
80 raise
81 return deco
IllegalArgumentException: "Error while instantiating 'org.apache.spark.sql.hive.HiveSessionStateBuilder':"
I'm not running any other Spark applications under my user. Also, if you could validate or give me feedback on the Spark session configuration, that would be greatly appreciated!
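For the config review, here is my own back-of-the-envelope resource math (my assumptions: YARN's default executor memory overhead of max(384 MB, 10% of executor memory), and the off-heap size counting toward the container): at the dynamic-allocation ceiling, 25 executors x 5 cores = 125 of the 250 vcores, and 25 x (16g heap + 20g off-heap + ~1.6g overhead) ≈ 940g of the 1TB queue memory, plus the 20g driver. That feels tight with two people sharing the queue, so please sanity-check those numbers as well.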