PDFNet: whitespace is added when extracting text from a Rect

283 views Asked by At

I need to extract the text covered by a highlight in a PDF using the PDFTron library for C# (PDFNet).

I create highlights with a helper class that contains these methods to create a highlight from a text selection:

/// <summary>
/// Builds the appearance stream used to paint a highlight over <paramref name="bbox"/>.
/// </summary>
/// <param name="bbox">Rectangle the highlight should cover; also written as the stream's "BBox".</param>
/// <returns>The stream object returned by the element writer, with "BBox" and "Subtype" = "Form" set.</returns>
private pdftron.SDF.Obj CreateHighlightAppearance(pdftron.PDF.Rect bbox)
{
    pdftron.SDF.Obj stm;

    // 'using' guarantees the builder and writer are released even when one of
    // the element calls below throws; the manual Dispose() calls only ran on success.
    using (ElementBuilder build = new ElementBuilder())
    using (ElementWriter writer = new ElementWriter())
    {
        writer.Begin(m_document);

        // Draw background: a filled, unstroked rectangle.
        // NOTE(review): ElementBuilder.CreateRect is documented as (x, y, width, height),
        // but the last two arguments here are corner coordinates — confirm this is intended.
        Element element = build.CreateRect(bbox.x1 - 2, bbox.y1, bbox.x2 + 2, bbox.y2);
        element.SetPathFill(true);
        element.SetPathStroke(false);

        // Fill in DeviceRGB using the multiply blend mode.
        GState gs = element.GetGState();
        gs.SetFillColorSpace(ColorSpace.CreateDeviceRGB());
        gs.SetBlendMode(GState.BlendMode.e_bl_multiply);

        writer.WriteElement(element);
        stm = writer.End();
    }

    // Set the bounding box
    stm.PutRect("BBox", bbox.x1, bbox.y1, bbox.x2, bbox.y2);
    stm.PutName("Subtype", "Form");
    return stm;
}

/// <summary>
/// Creates a highlight annotation over <paramref name="rect"/>, gives it the
/// appearance built by <c>CreateHighlightAppearance</c> and writes its "QuadPoints" array.
/// </summary>
/// <param name="rect">Rectangle the annotation covers.</param>
/// <returns>The newly created highlight annotation.</returns>
public Annot CreateHighlightAnnot(pdftron.PDF.Rect rect)
{
    Annot highlight = Annot.Create(m_document, Annot.Type.e_Highlight, rect);
    pdftron.SDF.Obj appearance = CreateHighlightAppearance(rect);
    highlight.SetAppearance(appearance);

    // Four corners as x/y pairs, in the order they are pushed into "QuadPoints":
    // (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    double[] corners =
    {
        rect.x1, rect.y2,
        rect.x2, rect.y2,
        rect.x1, rect.y1,
        rect.x2, rect.y1
    };

    pdftron.SDF.Obj quadPoints = highlight.GetSDFObj().PutArray("QuadPoints");
    foreach (double coordinate in corners)
    {
        quadPoints.PushBackNumber(coordinate);
    }

    return highlight;
}

/// <summary>
/// Turns the viewer's current text selection into highlight annotations,
/// one per selection quad, then marks the document modified and clears the selection.
/// Does nothing when there is no document or the selection holds no complete quads.
/// </summary>
public void AddHighlights()
{
    if (m_document == null)
    {
        return;
    }

    PDFViewCtrl.Selection selection = m_pdfViewer.GetSelection();
    int pageNumber = selection.GetPageNum();
    double[] quads = selection.GetQuads();

    // The array is consumed in groups of 8 numbers. Bail out on an empty or
    // ragged array so an empty selection no longer flags the document as modified.
    if (quads == null || quads.Length == 0 || quads.Length % 8 != 0)
    {
        return;
    }

    int numQuads = quads.Length / 8;
    for (int i = 0; i < numQuads; i++)
    {
        Rect selectionRect = GetSelectionRect(ref quads, i);
        Annot highlightAnnot = CreateHighlightAnnot(selectionRect);

        // Remove any underlying highlight first, to work with different colors.
        m_pdfViewer.RemoveHighlightAnnotationFromPage(highlightAnnot.GetRect(), pageNumber);
        m_pdfViewer.AddHighlightAnnotationToPage(highlightAnnot, true);
    }

    m_pdfViewer.SetDocumentModified();
    m_pdfViewer.ClearSelection();
}

This is the method I use to extract the text. The problem is that a whitespace character is always appended to the end of contentStr, which is annoying because the highlighted text does not always end with whitespace.

/// <summary>
/// Extracts the text lying under <paramref name="annot"/> on <paramref name="page"/>,
/// with extraction restricted to <paramref name="rect"/>.
/// </summary>
/// <param name="rect">Rectangle passed to the extractor together with the page.</param>
/// <param name="page">Page to extract text from.</param>
/// <param name="annot">Annotation whose underlying text is requested.</param>
/// <returns>The extractor's result, unmodified (not trimmed); an empty string if it returns null.</returns>
private string GetTextFromRect(Rect rect, pdftron.PDF.Page page, Annot annot)
{
    // Dispose the extractor deterministically instead of leaving it to the finalizer.
    using (TextExtractor txtExtractor = new TextExtractor())
    {
        txtExtractor.Begin(page, rect);

        // The original concatenated onto "", which turned a null result into "";
        // keep that contract explicitly.
        return txtExtractor.GetTextUnderAnnot(annot) ?? "";
    }
}
0

There are 0 answers