Windows 10, Python 3.9.7
I just installed PySpark and tried my first three lines of code below:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[1], line 4
1 import pyspark
2 from pyspark.sql import SparkSession
----> 4 spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
File ~\Anaconda3\envs\pyspark_env\Lib\site-packages\pyspark\sql\session.py:269, in SparkSession.Builder.getOrCreate(self)
267 sparkConf.set(key, value)
268 # This SparkContext may be an existing one.
--> 269 sc = SparkContext.getOrCreate(sparkConf)
270 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
271 # by all sessions.
272 session = SparkSession(sc, options=self._options)
File ~\Anaconda3\envs\pyspark_env\Lib\site-packages\pyspark\context.py:483, in SparkContext.getOrCreate(cls, conf)
481 with SparkContext._lock:
482 if SparkContext._active_spark_context is None:
--> 483 SparkContext(conf=conf or SparkConf())
484 assert SparkContext._active_spark_context is not None
485 return SparkContext._active_spark_context
File ~\Anaconda3\envs\pyspark_env\Lib\site-packages\pyspark\context.py:195, in SparkContext.__init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls, udf_profiler_cls)
189 if gateway is not None and gateway.gateway_parameters.auth_token is None:
190 raise ValueError(
191 "You are trying to pass an insecure Py4j gateway to Spark. This"
192 " is not allowed as it is a security risk."
193 )
--> 195 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
196 try:
197 self._do_init(
198 master,
199 appName,
(...)
208 udf_profiler_cls,
209 )
File ~\Anaconda3\envs\pyspark_env\Lib\site-packages\pyspark\context.py:417, in SparkContext._ensure_initialized(cls, instance, gateway, conf)
415 with SparkContext._lock:
416 if not SparkContext._gateway:
--> 417 SparkContext._gateway = gateway or launch_gateway(conf)
418 SparkContext._jvm = SparkContext._gateway.jvm
420 if instance:
File ~\Anaconda3\envs\pyspark_env\Lib\site-packages\pyspark\java_gateway.py:99, in launch_gateway(conf, popen_kwargs)
96 proc = Popen(command, **popen_kwargs)
97 else:
98 # preexec_fn not supported on Windows
---> 99 proc = Popen(command, **popen_kwargs)
101 # Wait for the file to appear, or for the process to exit, whichever happens first.
102 while not proc.poll() and not os.path.isfile(conn_info_file):
File ~\Anaconda3\envs\pyspark_env\Lib\subprocess.py:1024, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize, process_group)
1020 if self.text_mode:
1021 self.stderr = io.TextIOWrapper(self.stderr,
1022 encoding=encoding, errors=errors)
-> 1024 self._execute_child(args, executable, preexec_fn, close_fds,
1025 pass_fds, cwd, env,
1026 startupinfo, creationflags, shell,
1027 p2cread, p2cwrite,
1028 c2pread, c2pwrite,
1029 errread, errwrite,
1030 restore_signals,
1031 gid, gids, uid, umask,
1032 start_new_session, process_group)
1033 except:
1034 # Cleanup if the child failed starting.
1035 for f in filter(None, (self.stdin, self.stdout, self.stderr)):
File ~\Anaconda3\envs\pyspark_env\Lib\subprocess.py:1493, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_gid, unused_gids, unused_uid, unused_umask, unused_start_new_session, unused_process_group)
1491 # Start the process
1492 try:
-> 1493 hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
1494 # no special security
1495 None, None,
1496 int(not close_fds),
1497 creationflags,
1498 env,
1499 cwd,
1500 startupinfo)
1501 finally:
1502 # Child is launched. Close the parent's copy of those pipe
1503 # handles that only the child should have open. You need
(...)
1506 # pipe will not close when the child process exits and the
1507 # ReadFile will hang.
1508 self._close_pipe_fds(p2cread, p2cwrite,
1509 c2pread, c2pwrite,
1510 errread, errwrite)
FileNotFoundError: [WinError 2] The system cannot find the file specified
I installed pyspark in another Anaconda environment but still received the same error.
I also tried the following code:
import os
print(os.environ.get("SPARK_HOME"))
print(os.path.join(os.environ.get("SPARK_HOME"), './bin/spark-submit.cmd'))
and got:
C:\Spark\spark-3.1.2-bin-hadoop2.7
C:\Spark\spark-3.1.2-bin-hadoop2.7\./bin/spark-submit.cmd
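For reference, a quick check along these lines (a sketch; it assumes SPARK_HOME and JAVA_HOME are the relevant variables) shows whether the launcher script and a JVM are actually reachable, since WinError 2 means CreateProcess could not find the file it was asked to run:
import os
import shutil

spark_home = os.environ.get("SPARK_HOME")
spark_submit = os.path.join(spark_home, "bin", "spark-submit.cmd")

print("spark-submit.cmd exists:", os.path.isfile(spark_submit))  # the script Popen tries to launch
print("JAVA_HOME:", os.environ.get("JAVA_HOME"))
print("java on PATH:", shutil.which("java"))                     # Spark also needs a JVM to start the gateway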
Did you install Spark itself locally? Just installing the pyspark library does not do the trick.
If you only have Anaconda and the pyspark package on your laptop, with no local Spark installation, you would have to specify how to connect to a remote Spark cluster instead.
Check out this article for details. It is written for Jupyter notebooks connecting to EMR (an AWS Spark cluster), but the concepts are the same for any remote cluster.
In short, more information is needed to solve your problem. Please verify where Spark is installed.
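As a rough sketch, the two setups look like this (the install path and master URL below are placeholders, not taken from the question):
import os
from pyspark.sql import SparkSession

# Option 1: local Spark install. SPARK_HOME must point at the unpacked
# distribution and a JDK must be reachable (JAVA_HOME set or java on PATH).
os.environ["SPARK_HOME"] = r"C:\Spark\spark-3.1.2-bin-hadoop2.7"  # placeholder path

spark = (
    SparkSession.builder
    .master("local[*]")                      # run Spark in-process, no cluster needed
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# Option 2: connect to a remote cluster instead (hypothetical master URL).
# spark = (
#     SparkSession.builder
#     .master("spark://remote-cluster-host:7077")
#     .appName("SparkByExamples.com")
#     .getOrCreate()
# )
Either way, the traceback above fails before any builder options matter, so the first thing to confirm is that spark-submit.cmd and a java executable exist where Spark expects to find them.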