How to convert images in PDF files to text using Java

1k views Asked by At

Kindly help me: how can I convert images in PDF files to text using Java? I am using the iText (itextpdf) library, added as a Maven dependency. Here is my code. It successfully converts the text in a PDF file to a text file.

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;

/**
 * Extracts the text content of every page of a PDF document into a plain-text file
 * using iText's {@link PdfReaderContentParser} with a {@link SimpleTextExtractionStrategy}.
 */
public class ExtractPageContent {

    /** Path of the PDF that {@link #main(String[])} reads. */
    public static final String PREFACE = "C:/Latest Maven Code/pdttotextconvertor/target/classes/SCORE.pdf";
    /** Path of the text file that {@link #main(String[])} writes. */
    public static final String RESULT = "C:/Latest Maven Code/pdttotextconvertor/target/classes/SCORE.txt";

    /**
     * Writes the extracted text of each page of {@code pdf} to {@code txt}, one
     * {@code println} per page, in page order.
     *
     * <p>Both the PDF reader and the output writer are released even when extraction
     * fails part-way through.
     *
     * @param pdf path of the PDF file to read
     * @param txt path of the text file to create (or overwrite)
     * @throws IOException if the PDF cannot be read, the output file cannot be opened,
     *         or writing the extracted text fails
     */
    public void parsePdf(String pdf, String txt) throws IOException {
        // Open the PDF first so that an unreadable input does not create/truncate the output file.
        PdfReader reader = new PdfReader(pdf);
        try (PrintWriter out = new PrintWriter(new FileOutputStream(txt))) {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            int pageCount = reader.getNumberOfPages();
            // iText page numbers are 1-based.
            for (int page = 1; page <= pageCount; page++) {
                TextExtractionStrategy strategy =
                        parser.processContent(page, new SimpleTextExtractionStrategy());
                out.println(strategy.getResultantText());
            }
            // PrintWriter never throws on write failures; checkError() flushes and
            // reports them, so surface the problem instead of silently truncating output.
            if (out.checkError()) {
                throw new IOException("Failed to write extracted text to " + txt);
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Converts {@link #PREFACE} to {@link #RESULT} and prints {@code DONE} on success.
     *
     * @param args ignored
     * @throws IOException if reading the PDF or writing the text file fails
     */
    public static void main(String[] args) throws IOException {
        ExtractPageContent extractPageContent = new ExtractPageContent();
        extractPageContent.parsePdf(PREFACE, RESULT);
        System.out.println("DONE");
    }
}

The Maven dependency I added is:

<!-- iText library; its com.itextpdf.text.pdf packages are imported by the Java code -->
<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.5.1</version>
</dependency>

Please help. Thanks in advance.

0

There are 0 answers