Spaces getting trimmed while extracting text from a PDF document using the "pdfjs-dist" (pdfjs) package in a Node.js application

159 views Asked by At

While extracting text from a PDF document, I'm running into an issue: runs of multiple spaces are being collapsed into a single space. A screenshot of the PDF document is attached for reference - enter image description here.

PDF data - "Cycle started Quick, 01/09/2023 12:03"

Example - "Cycle0started0000Quick,001/09/20230000012:03" after extracting into text "Cycle0started0Quick,001/09/2023012:03"

In the above example, each space is represented by a 0.

The current version, which has the space issue, is:

"pdfjs-dist": "3.11.174"

The version I was using previously, which doesn't have the space issue, is:

"pdfjs-dist": "2.11.338",

I'm using the code below to extract text from the PDF document:

import pdfjs from 'pdfjs-dist/build/pdf.js';
import * as pdfWorker from 'pdfjs-dist/build/pdf.worker.js';
import logger from './utils/logger.js';
import fs from 'fs/promises';

// Hands the imported worker module to pdf.js through GlobalWorkerOptions.workerSrc.
// NOTE(review): pdf.js documents workerSrc as a string path/URL of the worker
// script — confirm that assigning the module namespace object is intended here.
pdfjs.GlobalWorkerOptions.workerSrc = pdfWorker;

/**
 * Extracts the text of a PDF document with pdf.js, producing one output row
 * per run of consecutive text items that share the same vertical position.
 *
 * @param {object} options
 * @param {string} [options.file] - Path of the PDF to read; used instead of `dataBuffer` when truthy.
 * @param {Iterable<number>|ArrayLike<number>} [options.dataBuffer] - Raw PDF bytes, used when `file` is not given.
 * @param {number} [options.startPage=1] - First page number handed to `doc.getPage`.
 * @param {number} [options.endPage=Number.MAX_VALUE] - Last page to extract; clamped to `doc.numPages`.
 * @param {string} [options.columnSeparator=''] - Joins the text items of one row.
 * @param {string} [options.rowSeparator='\n'] - Joins all rows of the extracted pages.
 * @param {object} [options.renderOptions] - Passed through unchanged to `page.getTextContent`.
 * @returns {Promise<object|undefined>} `{ version, numPages, metaData, info, text }`, or
 *     `undefined` when extraction failed (the error is logged, not rethrown).
 */
const pdfToText = async function ({ file, dataBuffer, startPage = 1, endPage = Number.MAX_VALUE, columnSeparator = '', rowSeparator = '\n', renderOptions }) {
    // Declared outside `try` so `finally` can release it on every exit path.
    let doc;
    try {
        const bytes = Uint8Array.from(file ? await fs.readFile(file) : dataBuffer);
        doc = await pdfjs.getDocument(bytes).promise;

        const metaData = await doc.getMetadata();
        const result = {
            version: pdfjs.version,
            numPages: doc.numPages,
            metaData,
            info: metaData.info,
        };

        const lastPage = Math.min(endPage, doc.numPages);
        const rows = [];

        for (let pageNumber = startPage; pageNumber <= lastPage; pageNumber++) {
            const page = await doc.getPage(pageNumber);
            const textContent = await page.getTextContent(renderOptions);

            let lastY;
            let row = [];
            for (const item of textContent.items) {
                // Entries without a string `str` (e.g. marked-content markers) carry
                // no text and no transform; skip them instead of failing the whole run.
                if (typeof item.str !== 'string') {
                    continue;
                }
                // transform[5] is compared as the item's vertical position: a change starts a new row.
                const y = item.transform[5];
                if (lastY !== y) {
                    row = [];
                    rows.push(row);
                    lastY = y;
                }
                // With a string pattern, replace() rewrites only the FIRST space of
                // each item; kept as-is so the produced text is unchanged.
                row.push(item.str.replace(' ', '*'));
            }
        }

        result.text = rows.map((cells) => cells.join(columnSeparator)).join(rowSeparator);
        return result;
    } catch (err) {
        logger.error(`Error while extracting pdf doc ${err} : File name  ${file}`);
    } finally {
        // Release the document even when a page failed, and wait for the release
        // to finish; a failing release is logged so this function still never throws.
        if (doc) {
            try {
                await doc.destroy();
            } catch (err) {
                logger.error(`Error while releasing pdf doc ${err} : File name  ${file}`);
            }
        }
    }
}

export default pdfToText;

I have tried different versions of pdfjs-dist, but it doesn't work, and I can't use a lower version.

Also, I'm using esbuild to bundle the application, so I can't use a version lower than 3.11.174 because it throws a canvas error.

0

There are 0 answers