I'm learning pandas and trying to read a table from a Wikipedia page in Jupyter, but I get this error:
UnicodeEncodeError: 'ascii' codec can't encode character '\xed' in position 16: ordinal not in range(128)
df_mx = pd.read_html('https://es.wikipedia.org/wiki/Economía_de_México', match='Indicadores macroeconómicos, financieros y de bienestar')
and I get the following traceback:
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
Cell In[14], line 1
----> 1 df_mx = pd.read_html('https://es.wikipedia.org/wiki/Economía_de_México', match='Indicadores macroeconómicos, financieros y de bienestar')
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:1212, in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only, extract_links, dtype_backend)
1208 check_dtype_backend(dtype_backend)
1210 io = stringify_path(io)
-> 1212 return _parse(
1213 flavor=flavor,
1214 io=io,
1215 match=match,
1216 header=header,
1217 index_col=index_col,
1218 skiprows=skiprows,
1219 parse_dates=parse_dates,
1220 thousands=thousands,
1221 attrs=attrs,
1222 encoding=encoding,
1223 decimal=decimal,
1224 converters=converters,
1225 na_values=na_values,
1226 keep_default_na=keep_default_na,
1227 displayed_only=displayed_only,
1228 extract_links=extract_links,
1229 dtype_backend=dtype_backend,
1230 )
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:1001, in _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs)
999 else:
1000 assert retained is not None # for mypy
-> 1001 raise retained
1003 ret = []
1004 for table in tables:
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:981, in _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs)
978 p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)
980 try:
--> 981 tables = p.parse_tables()
982 except ValueError as caught:
983 # if `io` is an io-like object, check if it's seekable
984 # and try to rewind it before trying the next parser
985 if hasattr(io, "seekable") and io.seekable():
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:257, in _HtmlFrameParser.parse_tables(self)
249 def parse_tables(self):
250 """
251 Parse and return all tables from the DOM.
252
(...)
255 list of parsed (header, body, footer) tuples from tables.
256 """
--> 257 tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
258 return (self._parse_thead_tbody_tfoot(table) for table in tables)
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:666, in _BeautifulSoupHtml5LibFrameParser._build_doc(self)
663 def _build_doc(self):
664 from bs4 import BeautifulSoup
--> 666 bdoc = self._setup_build_doc()
667 if isinstance(bdoc, bytes) and self.encoding is not None:
668 udoc = bdoc.decode(self.encoding)
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:658, in _BeautifulSoupHtml5LibFrameParser._setup_build_doc(self)
657 def _setup_build_doc(self):
--> 658 raw_text = _read(self.io, self.encoding)
659 if not raw_text:
660 raise ValueError(f"No text parsed from document: {self.io}")
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:155, in _read(obj, encoding)
149 text: str | bytes
150 if (
151 is_url(obj)
152 or hasattr(obj, "read")
153 or (isinstance(obj, str) and file_exists(obj))
154 ):
--> 155 with get_handle(obj, "r", encoding=encoding) as handles:
156 text = handles.handle.read()
157 elif isinstance(obj, (str, bytes)):
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:716, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
713 codecs.lookup_error(errors)
715 # open URLs
--> 716 ioargs = _get_filepath_or_buffer(
717 path_or_buf,
718 encoding=encoding,
719 compression=compression,
720 mode=mode,
721 storage_options=storage_options,
722 )
724 handle = ioargs.filepath_or_buffer
725 handles: list[BaseBuffer]
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:368, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
366 # assuming storage_options is to be interpreted as headers
367 req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
--> 368 with urlopen(req_info) as req:
369 content_encoding = req.headers.get("Content-Encoding", None)
370 if content_encoding == "gzip":
371 # Override compression based on Content-Encoding header
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:270, in urlopen(*args, **kwargs)
264 """
265 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
266 the stdlib.
267 """
268 import urllib.request
--> 270 return urllib.request.urlopen(*args, **kwargs)
File ~/anaconda3/lib/python3.11/urllib/request.py:216, in urlopen(url, data, timeout, cafile, capath, cadefault, context)
214 else:
215 opener = _opener
--> 216 return opener.open(url, data, timeout)
File ~/anaconda3/lib/python3.11/urllib/request.py:519, in OpenerDirector.open(self, fullurl, data, timeout)
516 req = meth(req)
518 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 519 response = self._open(req, data)
521 # post-process response
522 meth_name = protocol+"_response"
File ~/anaconda3/lib/python3.11/urllib/request.py:536, in OpenerDirector._open(self, req, data)
533 return result
535 protocol = req.type
--> 536 result = self._call_chain(self.handle_open, protocol, protocol +
537 '_open', req)
538 if result:
539 return result
File ~/anaconda3/lib/python3.11/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
494 for handler in handlers:
495 func = getattr(handler, meth_name)
--> 496 result = func(*args)
497 if result is not None:
498 return result
File ~/anaconda3/lib/python3.11/urllib/request.py:1391, in HTTPSHandler.https_open(self, req)
1390 def https_open(self, req):
-> 1391 return self.do_open(http.client.HTTPSConnection, req,
1392 context=self._context, check_hostname=self._check_hostname)
File ~/anaconda3/lib/python3.11/urllib/request.py:1348, in AbstractHTTPHandler.do_open(self, http_class, req, **http_conn_args)
1346 try:
1347 try:
-> 1348 h.request(req.get_method(), req.selector, req.data, headers,
1349 encode_chunked=req.has_header('Transfer-encoding'))
1350 except OSError as err: # timeout error
1351 raise URLError(err)
File ~/anaconda3/lib/python3.11/http/client.py:1286, in HTTPConnection.request(self, method, url, body, headers, encode_chunked)
1283 def request(self, method, url, body=None, headers={}, *,
1284 encode_chunked=False):
1285 """Send a complete request to the server."""
-> 1286 self._send_request(method, url, body, headers, encode_chunked)
File ~/anaconda3/lib/python3.11/http/client.py:1297, in HTTPConnection._send_request(self, method, url, body, headers, encode_chunked)
1294 if 'accept-encoding' in header_names:
1295 skips['skip_accept_encoding'] = 1
-> 1297 self.putrequest(method, url, **skips)
1299 # chunked encoding will happen if HTTP/1.1 is used and either
1300 # the caller passes encode_chunked=True or the following
1301 # conditions hold:
1302 # 1. content-length has not been explicitly set
1303 # 2. the body is a file or iterable, but not a str or bytes-like
1304 # 3. Transfer-Encoding has NOT been explicitly set by the caller
1306 if 'content-length' not in header_names:
1307 # only chunk body if not explicitly set for backwards
1308 # compatibility, assuming the client code is already handling the
1309 # chunking
File ~/anaconda3/lib/python3.11/http/client.py:1135, in HTTPConnection.putrequest(self, method, url, skip_host, skip_accept_encoding)
1131 self._validate_path(url)
1133 request = '%s %s %s' % (method, url, self._http_vsn_str)
-> 1135 self._output(self._encode_request(request))
1137 if self._http_vsn == 11:
1138 # Issue some standard headers for better HTTP/1.1 compliance
1140 if not skip_host:
1141 # this header is issued *only* for HTTP/1.1
1142 # connections. more specifically, this means it is
(...)
1152 # but the host of the actual URL, not the host of the
1153 # proxy.
File ~/anaconda3/lib/python3.11/http/client.py:1215, in HTTPConnection._encode_request(self, request)
1213 def _encode_request(self, request):
1214 # ASCII also helps prevent CVE-2019-9740.
-> 1215 return request.encode('ascii')
UnicodeEncodeError: 'ascii' codec can't encode character '\xed' in position 16: ordinal not in range(128)
I tried adding .encode('UTF-8'), but it doesn't work.
I couldn't get the page to read successfully using pandas alone (credential error). Reading the page with the requests module and passing the text content to pandas worked, but:

- urllib.parse.quote() can be used on the original string. The last frame of the traceback is http.client calling request.encode('ascii'), so the accented characters in the URL have to be percent-encoded before urllib can send the request.
- match needs to be text in the table itself, not a Wikipedia section header.
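Here is a minimal sketch of that approach. The match string 'PIB' is a placeholder I'm assuming appears in the table body; replace it with text you know is inside the target table:

import urllib.parse
from io import StringIO

import pandas as pd
import requests

# Percent-encode the accented characters; safe=':/' keeps the scheme
# colon and the path separators intact.
url = urllib.parse.quote(
    'https://es.wikipedia.org/wiki/Economía_de_México', safe=':/'
)

# Fetch the page with requests, then hand the HTML text to pandas.
resp = requests.get(url)
resp.raise_for_status()

# match must occur in the table's own text, not in the section heading
# above it; 'PIB' here is only an assumed example.
tables = pd.read_html(StringIO(resp.text), match='PIB')
df_mx = tables[0]

pd.read_html returns a list of every table whose text matches, so this takes the first hit.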