Windows 10, Python 3.9.7
I just installed PySpark and tried my first three lines of code below:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[1], line 4
1 import pyspark
2 from pyspark.sql import SparkSession
----> 4 spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
File ~\Anaconda3\envs\pyspark_env\Lib\site-packages\pyspark\sql\session.py:269, in SparkSession.Builder.getOrCreate(self)
267 sparkConf.set(key, value)
268 # This SparkContext may be an existing one.
--> 269 sc = SparkContext.getOrCreate(sparkConf)
270 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
271 # by all sessions.
272 session = SparkSession(sc, options=self._options)
File ~\Anaconda3\envs\pyspark_env\Lib\site-packages\pyspark\context.py:483, in SparkContext.getOrCreate(cls, conf)
481 with SparkContext._lock:
482 if SparkContext._active_spark_context is None:
--> 483 SparkContext(conf=conf or SparkConf())
484 assert SparkContext._active_spark_context is not None
485 return SparkContext._active_spark_context
File ~\Anaconda3\envs\pyspark_env\Lib\site-packages\pyspark\context.py:195, in SparkContext.__init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls, udf_profiler_cls)
189 if gateway is not None and gateway.gateway_parameters.auth_token is None:
190 raise ValueError(
191 "You are trying to pass an insecure Py4j gateway to Spark. This"
192 " is not allowed as it is a security risk."
193 )
--> 195 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
196 try:
197 self._do_init(
198 master,
199 appName,
(...)
208 udf_profiler_cls,
209 )
File ~\Anaconda3\envs\pyspark_env\Lib\site-packages\pyspark\context.py:417, in SparkContext._ensure_initialized(cls, instance, gateway, conf)
415 with SparkContext._lock:
416 if not SparkContext._gateway:
--> 417 SparkContext._gateway = gateway or launch_gateway(conf)
418 SparkContext._jvm = SparkContext._gateway.jvm
420 if instance:
File ~\Anaconda3\envs\pyspark_env\Lib\site-packages\pyspark\java_gateway.py:99, in launch_gateway(conf, popen_kwargs)
96 proc = Popen(command, **popen_kwargs)
97 else:
98 # preexec_fn not supported on Windows
---> 99 proc = Popen(command, **popen_kwargs)
101 # Wait for the file to appear, or for the process to exit, whichever happens first.
102 while not proc.poll() and not os.path.isfile(conn_info_file):
File ~\Anaconda3\envs\pyspark_env\Lib\subprocess.py:1024, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize, process_group)
1020 if self.text_mode:
1021 self.stderr = io.TextIOWrapper(self.stderr,
1022 encoding=encoding, errors=errors)
-> 1024 self._execute_child(args, executable, preexec_fn, close_fds,
1025 pass_fds, cwd, env,
1026 startupinfo, creationflags, shell,
1027 p2cread, p2cwrite,
1028 c2pread, c2pwrite,
1029 errread, errwrite,
1030 restore_signals,
1031 gid, gids, uid, umask,
1032 start_new_session, process_group)
1033 except:
1034 # Cleanup if the child failed starting.
1035 for f in filter(None, (self.stdin, self.stdout, self.stderr)):
File ~\Anaconda3\envs\pyspark_env\Lib\subprocess.py:1493, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_gid, unused_gids, unused_uid, unused_umask, unused_start_new_session, unused_process_group)
1491 # Start the process
1492 try:
-> 1493 hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
1494 # no special security
1495 None, None,
1496 int(not close_fds),
1497 creationflags,
1498 env,
1499 cwd,
1500 startupinfo)
1501 finally:
1502 # Child is launched. Close the parent's copy of those pipe
1503 # handles that only the child should have open. You need
(...)
1506 # pipe will not close when the child process exits and the
1507 # ReadFile will hang.
1508 self._close_pipe_fds(p2cread, p2cwrite,
1509 c2pread, c2pwrite,
1510 errread, errwrite)
FileNotFoundError: [WinError 2] The system cannot find the file specified
I installed pyspark in another Anaconda environment but still received the same error.
I also tried the following code:
import os
print(os.environ.get("SPARK_HOME"))
print(os.path.join(os.environ.get("SPARK_HOME"), './bin/spark-submit.cmd'))
and got:
C:\Spark\spark-3.1.2-bin-hadoop2.7
C:\Spark\spark-3.1.2-bin-hadoop2.7\./bin/spark-submit.cmd
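For reference, a quick check along these lines (a sketch; it assumes SPARK_HOME and JAVA_HOME are the relevant variables) shows whether the launcher script and a JVM are actually reachable, since WinError 2 means CreateProcess could not find the file it was asked to run:
import os
import shutil

spark_home = os.environ.get("SPARK_HOME")
spark_submit = os.path.join(spark_home, "bin", "spark-submit.cmd")

print("spark-submit.cmd exists:", os.path.isfile(spark_submit))  # the script Popen tries to launch
print("JAVA_HOME:", os.environ.get("JAVA_HOME"))
print("java on PATH:", shutil.which("java"))                     # Spark also needs a JVM to start the gateway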
Did you install Spark itself locally? Just installing the pyspark library does not do the trick.
If you only have Anaconda and the pyspark package on your laptop, with no local Spark installation, you would have to specify how to connect to a remote Spark cluster instead.
Check out this article for details. It is written for Jupyter notebooks connecting to EMR (an AWS Spark cluster), but the concepts are the same for any remote cluster.
In short, more information is needed to solve your problem. Please verify where Spark is installed.
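As a rough sketch, the two setups look like this (the install path and master URL below are placeholders, not taken from the question):
import os
from pyspark.sql import SparkSession

# Option 1: local Spark install. SPARK_HOME must point at the unpacked
# distribution and a JDK must be reachable (JAVA_HOME set or java on PATH).
os.environ["SPARK_HOME"] = r"C:\Spark\spark-3.1.2-bin-hadoop2.7"  # placeholder path

spark = (
    SparkSession.builder
    .master("local[*]")                      # run Spark in-process, no cluster needed
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# Option 2: connect to a remote cluster instead (hypothetical master URL).
# spark = (
#     SparkSession.builder
#     .master("spark://remote-cluster-host:7077")
#     .appName("SparkByExamples.com")
#     .getOrCreate()
# )
Either way, the traceback above fails before any builder options matter, so the first thing to confirm is that spark-submit.cmd and a java executable exist where Spark expects to find them.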