I need to extract the text from a highlight in a PDF with the PDFTron library for C# (PDFNet).
I create highlights with a helper class that contains these methods to create a highlight from a text selection:
/// <summary>
/// Builds the appearance stream for a highlight annotation: a single filled,
/// unstroked rectangle written with the multiply blend mode, tagged as a
/// Form XObject whose BBox is <paramref name="bbox"/>.
/// </summary>
/// <param name="bbox">Rectangle used both to draw the background and as the stream's BBox.</param>
/// <returns>The stream object produced by the element writer for <c>m_document</c>.</returns>
private pdftron.SDF.Obj CreateHighlightAppearance(pdftron.PDF.Rect bbox)
{
    pdftron.SDF.Obj stm;

    // The using blocks release the builder and writer even when one of the
    // calls below throws; the original only disposed them on the success path.
    using (ElementBuilder build = new ElementBuilder())
    using (ElementWriter writer = new ElementWriter())
    {
        writer.Begin(m_document);

        // Draw background.
        // NOTE(review): ElementBuilder.CreateRect is documented as
        // (x, y, width, height), but the last two arguments here look like
        // corner coordinates. Left unchanged - confirm against the API docs.
        Element element = build.CreateRect(bbox.x1 - 2, bbox.y1, bbox.x2 + 2, bbox.y2);
        element.SetPathFill(true);
        element.SetPathStroke(false);

        // NOTE(review): only the fill color space and blend mode are set;
        // no explicit fill color is assigned in this method - confirm intended.
        GState gs = element.GetGState();
        gs.SetFillColorSpace(ColorSpace.CreateDeviceRGB());
        gs.SetBlendMode(GState.BlendMode.e_bl_multiply);
        writer.WriteElement(element);

        stm = writer.End();
    }

    // Set the bounding box and mark the stream as a form.
    stm.PutRect("BBox", bbox.x1, bbox.y1, bbox.x2, bbox.y2);
    stm.PutName("Subtype", "Form");
    return stm;
}
/// <summary>
/// Creates a highlight annotation over <paramref name="rect"/> in <c>m_document</c>,
/// assigns it the appearance built by <c>CreateHighlightAppearance</c>, and writes
/// a single quad into its "QuadPoints" array.
/// </summary>
/// <param name="rect">Rectangle the highlight should cover.</param>
/// <returns>The newly created annotation.</returns>
public Annot CreateHighlightAnnot(pdftron.PDF.Rect rect)
{
    Annot highlight = Annot.Create(m_document, Annot.Type.e_Highlight, rect);
    highlight.SetAppearance(CreateHighlightAppearance(rect));

    // Quad corners in the order they are stored:
    // (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    double[] corners =
    {
        rect.x1, rect.y2,
        rect.x2, rect.y2,
        rect.x1, rect.y1,
        rect.x2, rect.y1
    };

    pdftron.SDF.Obj quadPoints = highlight.GetSDFObj().PutArray("QuadPoints");
    foreach (double coordinate in corners)
    {
        quadPoints.PushBackNumber(coordinate);
    }

    return highlight;
}
/// <summary>
/// Turns the viewer's current text selection into highlight annotations,
/// one per selection quad, then marks the document as modified and clears
/// the selection.
/// </summary>
/// <remarks>
/// Does nothing when <c>m_document</c> is null, when the selection has no
/// quads, or when the quad array length is not a multiple of 8.
/// </remarks>
public void AddHighlights()
{
    if (m_document == null)
    {
        return;
    }

    PDFViewCtrl.Selection selection = m_pdfViewer.GetSelection();
    int pageNumber = selection.GetPageNum();
    double[] quads = selection.GetQuads();

    // Each quad is 8 numbers (4 corner points). An empty selection is skipped
    // so the document is not flagged as modified when nothing was added.
    if (quads == null || quads.Length == 0 || quads.Length % 8 != 0)
    {
        return;
    }

    int numQuads = quads.Length / 8;
    for (int i = 0; i < numQuads; i++)
    {
        Rect selectionRect = GetSelectionRect(ref quads, i);
        Annot highlightAnnot = CreateHighlightAnnot(selectionRect);

        // Remove any underlying highlight, to work with different colors.
        m_pdfViewer.RemoveHighlightAnnotationFromPage(highlightAnnot.GetRect(), pageNumber);
        m_pdfViewer.AddHighlightAnnotationToPage(highlightAnnot, true);
    }

    m_pdfViewer.SetDocumentModified();
    m_pdfViewer.ClearSelection();
}
This is the method I use to extract the text. The problem is that a whitespace character is always appended to the end of contentStr, which is annoying because the trailing whitespace is not always part of the highlight.
/// <summary>
/// Extracts the text lying under <paramref name="annot"/> on <paramref name="page"/>,
/// with the extractor started on <paramref name="rect"/>.
/// </summary>
/// <param name="rect">Rectangle passed to <c>TextExtractor.Begin</c>.</param>
/// <param name="page">Page to extract text from.</param>
/// <param name="annot">Annotation whose underlying text is requested.</param>
/// <returns>The extracted text, or an empty string when the extractor returns null.</returns>
private string GetTextFromRect(Rect rect, pdftron.PDF.Page page, Annot annot)
{
    // Dispose the extractor deterministically; the original never released it.
    using (TextExtractor txtExtractor = new TextExtractor())
    {
        txtExtractor.Begin(page, rect);

        // NOTE(review): the text is returned exactly as the extractor produces
        // it. A trailing space has been observed in the result; it is not
        // trimmed here so existing callers see unchanged output - trim at the
        // call site if that is unwanted.
        return txtExtractor.GetTextUnderAnnot(annot) ?? "";
    }
}