Creating a Blob from unicodes (code points) in javascript

1.4k views Asked by At

I am trying to create a UTF-8 encoded HTML page from a Blob by specifying only the Unicode code point of each character that I want to display on the page.

For example, I am trying to display the characters 'a' and 'b' with a non-breaking space in between.

var uint8 = new Uint8Array([97, 160, 98]); // 97 = a, 160 = non-breaking space, 98 = b

The Blob works fine when I pass only code units within the ASCII range (0-127). But whenever a code unit is greater than 127 (e.g. code unit 160, the non-breaking space), it is displayed as an unrecognized character in the HTML.

Issue

Following is the code i used,

<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
</head>
<body>
    <div id="container">
        <a  id="nav" target="_blank" href="#"> click to navigate </a> <br />
        <iframe src="" id="i-frame"> </iframe>
    </div>
    <script type="text/javascript">
        // Reproduction of the reported problem: the values are written into the
        // Blob as raw bytes (97 = a, 160 = non-breaking space, 98 = b), and the
        // lone byte 160 is what shows up as an unrecognized character.
        const uint8 = Uint8Array.of(97, 160, 98);
        const htmlUtf8 = { type: "text/html;charset=UTF-8" };
        const blob = new Blob([uint8], htmlUtf8);
        const url = URL.createObjectURL(blob);
        document.getElementById("nav").href = url;
        document.getElementById("i-frame").src= url;
    </script>
</body>
</html>

After some research I found out that UTF-8 uses a maximum of 4 bytes to represent a character, and that code units above 127 require two or more bytes in UTF-8 (for example, 160 becomes the two bytes 194 160). So, in order to make my Unicode representation work in the Blob, I had to create it as follows:

var uint8 = new Uint8Array([97, 194, 160, 98]);

Question 1: Do we need to use a bit-shifting technique (as in https://gist.github.com/lihnux/2aa4a6f5a9170974f6aa) when we use code units greater than 127?

Question 2: However, if we do something similar with a Base64 string that contains binary data, such as an image or a PDF, we get the expected output without any issues.

var base64EncodedString = 'ABC== etc..';
// Decode the base64 text into a string and take one code unit per character.
const decoded = atob(base64EncodedString);

// Build the byte array from those code units, the same way the byte array
// for the HTML page in question 1 was built from code units.
const uint8 = Uint8Array.from(decoded, (ch) => ch.charCodeAt(0));

const blob = new Blob([uint8], { type: "image/jpeg" });
const url = URL.createObjectURL(blob);

Code for Question 2

<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
</head>
<body>
    <div id="container">
        <a  id="nav" target="_blank" href="#"> click to navigate </a> <br />
        <iframe src="" id="i-frame"> </iframe>
    </div>
    <script type="text/javascript">
        var imgBase64String = "/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBxMSEhMSExMVFRUXFRgXGBgVFRcYFhYVFxUWFxcYFRUYHSggGRslGxUWITEiJSkrLi4uFx8zODMsNygtLisBCgoKDg0OGhAQGi8lHyUtLS0tLS0tLS8tLS0tLS0tKy0tKy8tLS0tLS0tLS0tLS0tLy0tLS0tLS0tLS0tLS0tK//AABEIAOAA4QMBIgACEQEDEQH/xAAcAAEAAQUBAQAAAAAAAAAAAAAAAgMEBQYHAQj/xAA/EAACAQIDBgMFBgMGBwAAAAAAAQIDEQQhMQUGEkFRYXGBkQcTobHwFCIywdHhQmLxFSQzUoKSFhcjQ1Nysv/EABkBAQADAQEAAAAAAAAAAAAAAAABAgMEBf/EACIRAQEAAgICAwADAQAAAAAAAAABAhEDEiExBBNBFCJRcf/aAAwDAQACEQMRAD8A7gAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGK21vDQwtlUl956RjnK3VrkiMspjN0ZUGgYn2iu74KCtycpu9uTaS+Fyz/5iYi/+HSa/1fqYfyuP/Ve0dLBomz/aPBu1ai4rrB3+Dt8zb9mbTpYiPFSmpLmtJLxi80aYcuGfqpllXgANEgAAAAAAAAAAAAAAAAAAAAAAAAAAAGK3hpV6lN0qFouWTm3a0eajzv3sVyupsYjeLeyNPip0mnJZOS5PpHr4nNMTNzk5OTcm823m33Zs+J3CxcVeMoS7KTT+KNbx2zqlB8NSnKL7r5PmeXzXkyu8ozu/1GnFEZ0rZEaditHPmZzFVaypWK2HxU6UlOnJwktHF2Yk+XyIu1rFtWDoGwd/U7QxKs9OOK+Mo/mvQ3ejVjOKlFqUWrpp3TXZnBGr+JnNgbeq4V3hK8OcG3wv9H3R1cXybPGS8ydiBj9j7YpYmkqsJK2jTdnGS1T+tGjUt8N+54bErD0VRajCNSpOpOyUJOSsrc7xXryyOy5yTs0xly9N9BrWA34wdSjTquqlxRu0k3ZrJ2y0unbseS37wSy45f7Sdw63/GzA1+hvng5O3vbeKdvVXMzhcZTqK8JxkuzTJRpXAAAAAAAAAAAAAAAAAAAAACniKEakXGcVKLyaayKgFmxpm2dwqc03Qk4S/wAsm3B+eq+Jpu0t1cZRu3SlJLnT+/52WaOylvtDHU6FOVWrJQhFXbZzZ/FwvmeFeu3AMTKcX96LXimnfwZ59tb/ABGT3y9plTEt08PHhpLRySvLvY1D+1an8cYS7xTi/Pl8PM5suLV8Xbb+JyWbjOe/XJ2+Pw/cvKGKXVetvn4mEwlaFRpK93y5+Fi+VBR1us+ngU9OfKXHxYvvtPDL8UoqSz79L37/ADNO9oNBqtTm84yp5NrnGT4l5cS9TYMbWcKcnorc+fl6Gr7zYqc4UryUqcW1HK0ouaV1fnF8HPNWOjiv46/j+cL4dN2ZudBUqalKTtGPO3L8yeN3QStwNrq9f6lTZW1fexp/fjFcEW7zSd+FZcOuvYy/9opfxuTWdlaK9Xm/ga9sXV0z21p7uTpLjm8uza+RW2TRr3vSUr/5k7JeXP1MzU2p7zKcINfzXa8ypS204pxjFRS5KKt8Csym/bXLiyuOuvlndjbQxcLKtOnNd5JS9TacPXU1dfNP4o5jVx8amaeZYVtpVaUrwm4tdGzackc2XxLP12IHP93d/rtU8RzyUl+fU32lVUkpRaaejRo5MsbjdVMABUAAAAAAAAAAAAAAAB43bNnAPaPvg8fXlSpy/u1N2il/3ZJ2c326dvE6B7Y95XhsKsPTdquIvG61jSX432vdRXi+hw2kjDmz/I7vicW72q6pUll9ZmybC2Iqv1qYClBq11yuZ3d3azo1I3eTavc5cbO3l6tl6f19tkW5NN3urdLfK5Z1d15U7
8Da8GbrgtsUqsVwtN9tLdcy5tGSOrpjY8y8uW9ZOaUcHTg375Od8rSuyrhKdODX2fDR4r/ineXC+3Fe2vI3tbGpSd9cy5hs6nBZWHRH3YT1GuYDY1SUc1GN3d8MUuXUy9HYUIxV1drVvMvHi4RWq8i2q7YjyK/Xji0+/kzef2bEtsZsiPLIVNspFhjNuxXO/mV/q0n2LHE4JRvxNro1179jC1Mbf7rd7adfAudqbU4ot36mo1MRJyur9fLW5XXXy6Jlc5qs3i3bR3Tz8H+ptO4e+DpSVKpK9Nu2esWc/jjL6+aKdKtwybu+3frny6mmHL5cnPw7j6ghJNJp3TzT7HppHsz3g99SdCbvKH4X1j+xu50vMs1dAACAAAAAAAAAAAAC32jilSpVKr0hCU/9sW/yA+evaVtX7RtOvJ5wpP3MVfK0Pxes3I12jHmU3VlNucs5Tbk/GTbfxbLmMbNx53+szhzy3dvb4cOskV4Tyt3vfn4FSMijcg5GOnV20yOFx1Sm7xkzKf8AFVVpRV0+TRr0alrFLjNMcrIzzxxy9xseH3trwbTk2i9jvfK1m5eenkajGfLun6d/MuHZRbtp+5pM6zvDhfxn8RvBKS5mPW15Wsm07u+evTL19TE+/bV/gW7qGeVtaYzHH1GbrbVk1qWNTFN3zLOM7kZSKTFa8nhcKtdNNvTLx7lGnVcXdOzz07qz+Z5Vmnaytlnnq88+xTRdnaqUqtnpyaz7nnvfgU0SryXArL713d9Vyy9fUmKW+G3bg7TdLEwknpKztzTdj6CjK6TWjVz5d3ak4zT7r5n0ju5iPeYalL+W3pkduF3i8nnms2SABZiAAAAAAAAAAAaz7SsRwbMxb5uk4L/W1H8zZjT/AGsxvsyuu8P/ALRGXqr8c3nP+vn2loivSazvrbLxvz8rlOGhGDzOCvbl0rp5Hsa3De3NNadfEpspMiRa5K3K9+ehGXjfL0fQpXBOldqsJCcyB5LJ2JNpU5EJs9sQkrN9granQkr59OXXkeSkeWyXxI3BtI9uQbJWA9hUtfK+X0ytTw3G7LPJPwfRkKMLmw7Iw8UvXLkvrL0L4Y7rPPPrFHYmDcNVz+B3fcp/3WK6Nr5HJMDTvI67udG2GXeTfyOvGajzOa7rOAAlkAAAAAAAAAAAa/v7hfebPxMefu+JeMWn+RsBRxlBTpzg/wCKLj6qwvlON1dvlKT5EFJ8jL7wYD3VWUbaNmHepxXHT2Jn2m4rX5E3OycbLNp3tnlfR+fyKDqt5vw8kRlK5XS/ZUeXoEjyMu77+IjKzaa5c++jJQmnyKcos8jNp3XIlUq3zJ0be3yKbbF1w87538OWXqexS7+pERfKV8iCRUnQa75Xy5dn3EUubGkoqJJakp2ysuWfdkVC+nQnQr0GZ/ZcrtZ816GswdjZthUsr88jXBhy1ndnwzy5s7FsOjwUKcf5fmc03cwDnUiu51eEbJJclY6J6ednd1IABQAAAAAAAAAAAAAcj9pO7qVaVRJ2kr5d9ficxxeEfG+d809L2/ofSm8uzFXotW+8s1+aOG7f2Y4vSzTMuTHcdvxuT8rUbaohGLLqdJ8Tyzvp18ChGdnllrp35HM7bEYyKjtbLXn+xCydlzvqSpQTlZu3fVfAEUrnsIu1+V7FT3TbsmtMrtJc3qyNKDzXn6fTJ2jXlKC7ft3JKVjyku/79io7ZJrS92tX43IWk8PXN2S73KahfTW9rc3cmnb0sU1HqSmql76+H6Hk1Z2fLoRbPITtnfPTyJ0japBp2Vud7m7bvYa8U+X9DScLBylFRXO1+rOr7A2bdQpRWeSf5m/FHH8jLTbdy9n2TqNaZLx5m2FHB4dU4RgtEisauAAAAAAAAAAAAAAAAANC363fWdWKyetuT/Q30hWpKcXGSumrNBMurt81bXwDi76P5mEnC6fU6vvnsH3EmmrwlnGX5eJzfaGF4JduqOfPDVelxcvbHVYzi0yXQ8v0yJVPwtWWt78yhN+Oh
Xqtc9Kr1J5eVvr4lDj0fPnd6slGS6Z3+HgR1TM1eCWpGMswnz5X5aeRGM9ejel9PH1I0v2VJVLSuvIj7y6d34IhHrk/G+Z4s33fzJ0i5KvA7X6fnoRvdW0tfO2bfQQlr/UrYDDSqNLvkvEt12i5SRm909mt1OJrT5s7hufs3hj7xrsvzZom6uyfvU6MV9569lzkzrtCkoRUVolZHRjNR5nLn2yTABLMAAAAAAAAAAAAAAAAAISqWAttq7OhiKcqU1k9HzT5NHBd8tj1MJVlTqLLWMuUo9U/qx32pirGu704fD4uk6VdXWsZLKUJdYvkRZtfDK4187VZJPsWdStn2uZzebdPEYeTdP8A69LlKK+8l/NDW/hc1GdZp24Wn0s7+hn1dP2r9VcrPx+mVlMt8Pg6rV3Hh/8AbJvyE4yjlJNd9blbF8c4vKkvgiKfV2LR1vpnk6+mZGmn2Rdudly+F/Ihxr6ZaPFdPQnClUab4Wl3y+eY6q3ki+w95Oy1f6m27JjCgk8p1nolna/XuaVs3B16s1FNUk9ZO7y8sztW4WwMLhkp3dat/wCSotH/ACR5fFmmM05+Tk7Nt3H2O8PSdSr/AI1TOV/4Y8o/mzZ1UMZTxFy4hM0c69TPShCRViEJABgAAAAAAAAAAAAAHkkUKlNlwAMXXw7ZiMdsuUja7EXTQTty7ae7FWV7Nmv1tyKl78J290F0IvCx6EaT2cJqbnVf8rKE9z6jVnBtHe3g49Dz7FDoNJ7184Yn2eVW/u8UV0tc9pez2pzi343+R9HfYYdB9hh0RHU71wHDbiVF/DbwRk6G4knqmdsWCh0JLCx6E6O9cu2duYo/wmzYHY3BojbVQj0JKmug0i5MVh8K0XlOiXSR6SqhGmTSAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA//2Q==";

        // Decode the base64 image text into a string.
        const decoded = atob(imgBase64String);

        // Copy every character's code unit into a byte array of the same length.
        const uint8 = Uint8Array.from(decoded, (ch) => ch.charCodeAt(0));

        // Expose the bytes as a JPEG Blob and point the link and the iframe at it.
        const blob = new Blob([uint8], { type: "image/jpeg" });
        const url = URL.createObjectURL(blob);
        const navLink = document.getElementById("nav");
        navLink.href = url;
        const frame = document.getElementById("i-frame");
        frame.src = url;
    </script>
</body>
</html>

Does this second approach work because binary files such as images and PDFs have no charset?

Can anyone please explain these two cases?

1

There is 1 answer

0
Kaiido On

The values you use are Unicode code points (for these characters they are also the UTF-16 code units), not UTF-8 bytes. The U+00A0 NO-BREAK SPACE character in UTF-8 is represented by the two bytes 0xC2 0xA0.

new Blob( [ <ArrayBuffer> ] ) won't change the data passed to the Blob at all, so the bytes in the ArrayBuffer will also be in the resulting Blob.
So at this point you have created a text file whose bytes are the raw code point values (one byte per character), which is not valid UTF-8.
Then, you tell the browser to fetch it as an html document encoded in UTF-8. When it sees the 0xA0 byte alone, it doesn't know what to do and replaces it with the U+FFFD REPLACEMENT CHARACTER.

So if you want to make a Blob from the string "a\u00A0b" encoded in UTF-8, you can either pass that string directly, since new Blob( [ <DOMString> ] ) does encode the DOMString to UTF-8 automatically

// A string part is encoded to UTF-8 by the Blob constructor itself, so the
// non-breaking space can simply be written as a character escape.
const data = "a\u00a0b";
const htmlUtf8 = { type: "text/html;charset=UTF-8" };
const blob = new Blob([data], htmlUtf8);
const url = URL.createObjectURL(blob);
document.getElementById("i-frame").src= url;
<div id="container">
    <iframe src="" id="i-frame"> </iframe>
</div>

or if you really just have this ArrayBuffer filled with some UTF-16 code points in an Uint8Array (which seems utterly weird since not all UTF-16 characters can be represented in a single byte), then you can generate a DOMString from these code points:

// Each element is treated as a code unit, not as a UTF-8 byte.
const uint8 = Uint8Array.of(97, 160, 98);

// Turn every code unit into its one-character string and glue them together;
// the Blob constructor then encodes the resulting string as UTF-8.
const data = Array.from(uint8, (code) => String.fromCharCode(code)).join("");
const htmlUtf8 = { type: "text/html;charset=UTF-8" };
const blob = new Blob([data], htmlUtf8);
const url = URL.createObjectURL(blob);
document.getElementById("i-frame").src= url;
<div id="container">
    <iframe src="" id="i-frame"> </iframe>
</div>

And finally if you have the data properly encoded in UTF-16, with correctly 2 bytes per character (i.e in a Uint16Array), then you can use a TextDecoder:

// Two bytes per character: every element is one UTF-16 code unit.
const uint16 = Uint16Array.of(97, 160, 98);

// NOTE(review): the "utf-16" label is presumably decoded as little-endian,
// which matches the typed array's byte order on common platforms - confirm
// before relying on this on a big-endian host.
const utf16Decoder = new TextDecoder("utf-16");
const data = utf16Decoder.decode(uint16);
const htmlUtf8 = { type: "text/html;charset=UTF-8" };
const blob = new Blob([data], htmlUtf8);
const url = URL.createObjectURL(blob);
document.getElementById("i-frame").src= url;
<div id="container">
    <iframe src="" id="i-frame"> </iframe>
</div>

Or, if you really just want the Uint8Array of this UTF-8 text, then use the correct values:

// Already valid UTF-8: 0xC2 0xA0 is the two-byte encoding of U+00A0, so the
// bytes can go into the Blob unchanged.
const uint8 = Uint8Array.of(97, 0xC2, 0xA0, 98);
const htmlUtf8 = { type: "text/html;charset=UTF-8" };
const blob = new Blob([uint8], htmlUtf8);
const url = URL.createObjectURL(blob);
document.getElementById("i-frame").src= url;
<div id="container">
    <iframe src="" id="i-frame"> </iframe>
</div>