I wrote code that downloads data from the Florida Department of Emergency Management website. For months the code has worked just fine. However, today when I run it, I get the error below. I even tested just the wget call with a direct link to one of the files and still got the same error. I've double-checked my user agent and modified my header. If I download a file from the site using wget without running my Selenium-based script, I can download the files individually. I'm guessing the site is blocking me because it recognizes me as a bot, but I'm not sure how. Can anyone explain why this is still happening and what I can do to fix the issue?
USER AGENT: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36
CODE:
from selenium.webdriver.chrome.options import Options
# Build the Chrome option set by passing each command-line switch in turn.
chrome_options = Options()
for chrome_switch in (
    "--incognito",
    "--disable-plugins-discovery",
    "--start-maximized",
):
    chrome_options.add_argument(chrome_switch)

# NOTE(review): `webdriver` and `chrome_path` are not defined in this snippet;
# presumably they are imported/assigned earlier in the script.
driver = webdriver.Chrome(chrome_path, options=chrome_options)
# Download every "Data Report" link in `result_full` that is not already
# present in `pdf_output_path`, reporting progress as it goes.
#
# Reads:  result_full (elements exposing .text and .get_attribute('href')),
#         pdf_output_path (destination directory).
# Writes: download_counter (files fetched), download_list (their file names).
import urllib.request

# wget.download() fetches through urllib.request.urlretrieve(), which uses
# urllib's globally installed opener. Without this, requests go out with
# urllib's default "Python-urllib/x.y" User-Agent rather than the browser's,
# which is a likely reason for the server answering 403 Forbidden.
# NOTE(review): if the 403 persists, the site may also require cookies or a
# Referer header -- confirm against the site's behaviour.
BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
)
url_opener = urllib.request.build_opener()
url_opener.addheaders = [("User-Agent", BROWSER_USER_AGENT)]
urllib.request.install_opener(url_opener)

print('Starting Data Download')
link_counter = 0
download_counter = 0
# NOTE(review): 152 is a hard-coded offset; presumably the number of links on
# the page that are not data reports -- confirm, it breaks if the page changes.
link_n = len(result_full) - 152
download_list = []
for links in result_full:
    # NOTE(review): `> 0` skips link text that *starts* with 'Data Report'
    # (find() returns 0 there). Kept as-is because link_n appears to be
    # calibrated to this condition; use `'Data Report' in links.text` if
    # such links should be included.
    if links.text.find('Data Report') > 0:
        link_url = links.get_attribute('href')
        filename = wget.filename_from_url(link_url)
        if not os.path.exists(f'{pdf_output_path}/{filename}'):
            wget.download(link_url, out=f'{pdf_output_path}')
            download_counter += 1
            # The original wrote `download_list.append` without calling it,
            # so nothing was ever recorded.
            download_list.append(filename)
            print("Downloading", links.text)
        link_counter += 1
        # Guard against ZeroDivisionError when the page has exactly 152 links.
        if link_n > 0:
            print(f'{round((link_counter)*100/link_n,2)}% Complete')
print('Download of New Files Complete')
print(f'{download_counter} Files Created')
ERROR:
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
<ipython-input-112-3aafa377b505> in <module>
9 filename = wget.filename_from_url(link_url)
10 if not os.path.exists(f'{pdf_output_path}/{filename}'):
---> 11 wget.download(link_url, out = f'{pdf_output_path}')
12 download_counter += 1
13 download_list.append
E:\Anaconda\lib\site-packages\wget.py in download(url, out, bar)
524 else:
525 binurl = url
--> 526 (tmpfile, headers) = ulib.urlretrieve(binurl, tmpfile, callback)
527 filename = detect_filename(url, out, headers)
528 if outdir:
E:\Anaconda\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
E:\Anaconda\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
E:\Anaconda\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
E:\Anaconda\lib\urllib\request.py in http_response(self, request, response)
639 if not (200 <= code < 300):
640 response = self.parent.error(
--> 641 'http', request, response, code, msg, hdrs)
642
643 return response
E:\Anaconda\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
E:\Anaconda\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
501 for handler in handlers:
502 func = getattr(handler, meth_name)
--> 503 result = func(*args)
504 if result is not None:
505 return result
E:\Anaconda\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 403: Forbidden
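If you get a 403 error, it means you don't have permission to access the resource, according to the HTTP status codes, which you can check here: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
You might need some kind of authentication on the website; in that case you would need to post your credentials through your app before trying to download. Alternatively, the server may be rejecting the request because of its headers: the traceback shows wget downloading through Python's urllib (urlretrieve), which does not send your browser's User-Agent unless you set one, so sending a browser-like User-Agent is worth trying.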