While extracting text from a PDF document I'm running into an issue: runs of multiple spaces are being collapsed into a single space. An image of the PDF document is attached for reference.
PDF data - "Cycle started Quick, 01/09/2023 12:03"
Example (spaces are represented by `0`): the PDF contains "Cycle0started0000Quick,001/09/20230000012:03", but after extracting it into text I get "Cycle0started0Quick,001/09/2023012:03".
The current version, which has the space issue, is:
"pdfjs-dist": "3.11.174"
The version I was using previously, which doesn't have the space issue, is:
"pdfjs-dist": "2.11.338",
I'm using the code below to extract the text from the PDF document:
import pdfjs from 'pdfjs-dist/build/pdf.js';
import * as pdfWorker from 'pdfjs-dist/build/pdf.worker.js';
import logger from './utils/logger.js';
import fs from 'fs/promises';
// Configure pdf.js to use the worker module imported at the top of this file.
// NOTE(review): `pdfWorker` is a module namespace object (`import * as ...`), not a
// path/URL string — confirm that this pdf.js version accepts that for `workerSrc`.
pdfjs.GlobalWorkerOptions.workerSrc = pdfWorker;
/**
 * Extracts the text of a PDF document, page by page, using pdf.js.
 *
 * Consecutive text items that share the same `transform[5]` value (treated here
 * as the item's vertical position) are joined into one row with
 * `columnSeparator`; rows are joined with `rowSeparator`.
 *
 * @param {object} options
 * @param {string} [options.file] - Path of the PDF to read. When truthy it takes
 *   precedence over `dataBuffer`.
 * @param {Iterable<number>|ArrayLike<number>} [options.dataBuffer] - Raw PDF bytes,
 *   used only when `file` is not given.
 * @param {number} [options.startPage=1] - First page to extract (1-based).
 * @param {number} [options.endPage=Number.MAX_VALUE] - Last page to extract;
 *   clamped to the document's page count.
 * @param {string} [options.columnSeparator=''] - Separator between items of a row.
 * @param {string} [options.rowSeparator='\n'] - Separator between rows.
 * @param {object} [options.renderOptions] - Passed through unchanged to
 *   `page.getTextContent()`.
 * @returns {Promise<{version: *, numPages: number, metaData: object, info: *, text: string}|undefined>}
 *   The extraction result, or `undefined` when anything fails (the error is
 *   logged through `logger.error`, not rethrown).
 */
const pdfToText = async function ({ file, dataBuffer, startPage = 1, endPage = Number.MAX_VALUE, columnSeparator = '', rowSeparator = '\n', renderOptions }) {
  try {
    // Always hand pdf.js a fresh Uint8Array copy of the input bytes.
    const bytes = Uint8Array.from(file ? await fs.readFile(file) : dataBuffer);
    const doc = await pdfjs.getDocument(bytes).promise;

    try {
      const metaData = await doc.getMetadata();
      const result = {
        version: pdfjs.version,
        numPages: doc.numPages,
        metaData,
        info: metaData.info,
      };

      const lastPage = Math.min(endPage, doc.numPages);
      const lines = [];

      for (let pageNumber = startPage; pageNumber <= lastPage; pageNumber++) {
        const page = await doc.getPage(pageNumber);
        const textContent = await page.getTextContent(renderOptions);

        const rows = [];
        let currentRow = [];
        let lastY;

        for (const item of textContent.items) {
          // Marked-content entries (emitted when `includeMarkedContent` is set in
          // renderOptions) carry no `str`/`transform`; skip them instead of
          // failing the whole extraction on a TypeError.
          if (typeof item.str !== 'string') {
            continue;
          }

          const y = item.transform[5];
          if (lastY !== y) {
            // A different vertical position starts a new row.
            currentRow = [];
            rows.push(currentRow);
            lastY = y;
          }

          // NOTE(review): with a string pattern, `replace` substitutes only the
          // FIRST space of the item with '*'. This looks like a debugging aid for
          // visualising spaces; behavior is kept as-is — confirm whether it
          // should be removed (or be `replaceAll`).
          currentRow.push(item.str.replace(' ', '*'));
        }

        lines.push(...rows.map((row) => row.join(columnSeparator)));
      }

      result.text = lines.join(rowSeparator);
      return result;
    } finally {
      // Release the document even when extraction fails part-way through, and
      // wait for the cleanup so a rejection cannot go unhandled.
      await doc.destroy();
    }
  } catch (err) {
    logger.error(`Error while extracting pdf doc ${err} : File name ${file}`);
  }
};
export default pdfToText;
I have tried different versions of pdfjs-dist, but it doesn't work.
Also, I'm bundling the application with esbuild, so I can't use a version lower than 3.11.174; lower versions throw a canvas error.