Node convert pdf from express file upload to text

782 views Asked by At

I need to upload a PDF to my server and extract the text from it. This is what I have:

const express = require('express');
const fileUpload = require('express-fileupload');

// Create the Express application instance.
const app = express();

// Register the express-fileupload middleware; the route below reads uploads from req.files.
app.use(fileUpload());

/**
 * POST /upload — accepts a file uploaded in the multipart field named "File".
 *
 * Responds with:
 *   - 400 when the request carries no "File" upload,
 *   - 200 once the uploaded file object has been read,
 *   - 500 with the error message if anything unexpected throws.
 */
app.post('/upload', (req, res) => {
  try {
    // req.files may be absent when nothing was uploaded, so guard before
    // dereferencing it instead of relying on a TypeError.
    const sampleFile = req.files && req.files.File;
    if (!sampleFile) {
      res.status(400).send('No file was uploaded in the "File" field.');
      return;
    }

    // Always answer the request; without a response the client would wait
    // until it times out.
    res.sendStatus(200);
  } catch (err) {
    // Report failures with an error status rather than the default 200.
    res.status(500).send(err.message);
  }
});

// Listen on the port given by the PORT environment variable, falling back to 5000.
const PORT = process.env.PORT || 5000;
app.listen(PORT, () => {
  console.log(`server started on port ${PORT}`);
});

When I log sampleFile, it's a JSON object. It has a bunch of data with numbers, and I'm not sure how I can get the text from this. The libraries I have seen that do this expect a file on disk that they can point to.

1

There is 1 answer

0
Terry Lennox On BEST ANSWER

If we use the pdf2json module, we can create a parser, then pass the uploaded data to the parseBuffer function.

We can either access the parsed file object, or get the raw text content using the getRawTextContent() function.

const express = require('express');
const fileUpload = require('express-fileupload');
const PDFParser = require("pdf2json");

// Create the Express application instance.
const app = express();

// Serve static files from "./".
// NOTE(review): "./" presumably resolves to the process's working directory, which would
// also expose this server script over HTTP — consider a dedicated public directory; confirm.
app.use(express.static("./"));
// Register the express-fileupload middleware; the route below reads uploads from req.files.
app.use(fileUpload());

/**
 * POST /upload — extracts the text of a PDF uploaded in the multipart field "File".
 *
 * Responds with:
 *   - 400 when the request carries no usable "File" upload,
 *   - 201 once the text has been extracted (the text is logged, not returned),
 *   - 500 with a message when extraction fails.
 */
app.post('/upload', async (req, res) => {
    // req.files may be absent when nothing was uploaded, so guard before
    // dereferencing it instead of relying on a TypeError.
    const sampleFile = req.files && req.files.File;
    if (!sampleFile || !sampleFile.data) {
        res.status(400).send('No file was uploaded in the "File" field.');
        return;
    }

    try {
        console.log("Sample file:", sampleFile);
        const text = await getPDFText(sampleFile.data);
        console.log("PDF Text:", text);
        res.sendStatus(201);
    } catch (err) {
        // The rejection value is not guaranteed to be an Error (the parser's
        // error event may carry a plain object), so derive a message defensively.
        const detail = (err && (err.message || err.parserError)) || "Failed to extract text from PDF";
        // Report failures with an error status rather than the default 200.
        res.status(500).send(String(detail));
    }
});

/**
 * Extracts the raw text content of a PDF held in memory.
 *
 * @param {Buffer} data - The bytes of the PDF document.
 * @returns {Promise<string>} Resolves with the result of the parser's
 *   getRawTextContent() once the "pdfParser_dataReady" event fires.
 *   Rejects with an Error when the parser emits "pdfParser_dataError" or when
 *   reading the text throws; the parser's original payload is kept on the
 *   error's `parserError` property.
 */
function getPDFText(data) {
    return new Promise((resolve, reject) => {
        // The truthy second argument asks pdf2json to collect raw text so that
        // getRawTextContent() has something to return.
        const pdfParser = new PDFParser(null, 1);

        pdfParser.on("pdfParser_dataError", errData => {
            if (errData instanceof Error) {
                reject(errData);
                return;
            }
            // The error event may carry a plain { parserError } object; wrap it
            // so callers can rely on err.message.
            const detail = errData && errData.parserError !== undefined ? errData.parserError : errData;
            const reason = detail instanceof Error ? detail.message : String(detail);
            const error = new Error(`Failed to parse PDF: ${reason}`);
            error.parserError = detail;
            reject(error);
        });

        pdfParser.on("pdfParser_dataReady", () => {
            // A throw inside an event listener would escape the promise, so
            // convert it into a rejection.
            try {
                resolve(pdfParser.getRawTextContent());
            } catch (err) {
                reject(err);
            }
        });

        pdfParser.parseBuffer(data);
    });
}

// Listen on the port given by the PORT environment variable, falling back to 5000.
const PORT = process.env.PORT || 5000;
app.listen(PORT, () => {
    console.log(`server started on port ${PORT}`);
});