Replace pandoc Markdown citeproc references in html DOM by citations and add bibliography with citeproc-js

Question

Replace pandoc Markdown citeproc references in html DOM by citations and add bibliography with citeproc-js

119 views Asked by rriemann At 26 May 2023 at 15:04

Dears,

I create compliance reports with Node.js and Pug. Historically, we have used Markdown templates with Pandoc citeproc references that were processed by pandoc-citeproc, but I am now trying to reduce the dependencies and use citeproc-js (or citation.js) instead.

Our document has paragraphs such as:

<p>Cookie consent management is therefore the means through which web service users can <strong>express, reject and withdraw their consent</strong> to accessing or storing information on or related to their device in line with @regulation20181725, art. 37; @eprivacydirective, art. 5(3)</p>
<p>Lorem ipsum [See in this respect @edpb:guidelines012020, paras. 14 and 15] Lorem ipsum [See in this respect @edpb:cookiebannertaskforce2023, paras 2, 23, 24; @edpb:guidelines052020, para. 7]</p>

I have developed the following JavaScript based on citeproc-js to:

iterate over all p elements and match all brackets […]
iterate over the bib items in the brackets (one or more)
call citeproc-js and replace the bracket by a number
call citeproc-js to generate a bibliography

Unfortunately, my understanding of citationsPre and citationsPost is too poor to understand how I generate the number and the bibliography. (docs)

My code:

// Running endnote counter; incremented once for every citation group that was replaced.
let endnoteIndex = 0;
// Matches one bracketed pandoc citation group such as "[see @key, p. 3; @other]".
// The first capture group is the content between the brackets.
const regexpGroup = /\[([^\]\n]*@[^\]\n]+)\]/g;
// Splits a single reference into the text before "@", the id up to the first comma,
// and the text after that comma. The last group is named `suffix` because that is
// the property the replacement callback reads from the match.
const regexpRef = /(?<prefix>[^@]*)@(?<id>[^,]+)(,(?<suffix>.*))?/;

// The CSL style, the locale and the CSL-JSON items are read from elements embedded in the page.
const style = document.getElementById('citeproc-csl-style').textContent;
const locale = document.getElementById('citeproc-locale').textContent;
const bibliographyArray = JSON.parse(document.getElementById('citeproc-csl-bib').textContent);
// Index the items by id so that retrieveItem can look them up directly.
const bibliography = {};
// `const item` keeps the loop variable local; an undeclared `item` would become an
// implicit global and throws a ReferenceError in strict mode / ES modules.
for (const item of bibliographyArray) {
  bibliography[item.id] = item;
}

/**
 * System object passed to CSL.Engine. It serves the locale and the bibliography
 * items that were loaded from the page.
 */
const citeprocSys = {
  /**
   * Returns the one locale loaded from the page, regardless of the requested language.
   * @param {string} lang - requested locale tag (ignored)
   * @returns {string} the locale text
   */
  retrieveLocale: function (lang) {
    return locale;
  },
  /**
   * Looks up a bibliography item by its id.
   * @param {string} id - item id as used in the "@id" reference
   * @returns {object} the matching item
   * @throws {Error} when no item with this id exists
   */
  retrieveItem: function (id) {
    // Only own properties count: a plain object lookup would otherwise resolve
    // inherited keys such as "constructor" or "toString" to a function.
    const isKnown = Object.prototype.hasOwnProperty.call(bibliography, id);
    const item = isKnown ? bibliography[id] : undefined;
    if (!item) throw new Error(`Bibliography item ${id} not found.`);
    return item;
  }
};

// Engine instance built from the system object and the CSL style read from the page.
let citeproc = new CSL.Engine(citeprocSys, style);

// Passed as the second and third argument to every processCitationCluster call.
let citationsPre = [];
let citationsPost = [];

// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/replace#specifying_a_function_as_the_replacement
/**
 * Replacement callback used with replaceAll(regexpGroup, replacer).
 * Parses the references inside one bracket, hands them to citeproc as a single
 * citation and returns a link containing the rendered text.
 *
 * @param {string} match - the complete bracket, returned unchanged if citeproc throws
 * @param {string} refsString - first capture group: the text between the brackets
 * @returns {string} replacement markup for the bracket
 */
function replacer(match, refsString) {
  console.log(refsString);
  // One cite item per ";"-separated reference; "&nbsp;" entities are turned into plain spaces first.
  // NOTE: a segment that does not match regexpRef makes match() return null, and the
  // .groups access then throws outside of the try block below.
  let citationItems = refsString.replaceAll('&nbsp;',' ').split(';').map( refString => {
    let refData = refString.trim().match(regexpRef).groups;
    // https://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html#cite-items
    let ref = {
      id: refData.id,
      // CSL Locator list https://docs.citationstyles.org/en/stable/specification.html#locators
      /*
      locator: 12,
      label: "page",
      */
      prefix: refData.prefix,
      suffix: refData.suffix
    }
    return ref;
  });
  
  let citations = {
    citationItems: citationItems,
    properties: {
      // In the properties portion of a citation, the noteIndex value indicates the footnote number in which the citation is located within the document.
      // Citations within the main text of the document have a noteIndex of zero.
      noteIndex: 0
    }
  }
  
  let result;
  try {
    result = citeproc.processCitationCluster(citations, citationsPre, citationsPost);
    // Spreading an empty array pushes nothing, so citationsPre is still empty on the next call.
    citationsPre.push(...[]);
    console.log(result);
  } catch (error) {
    // Leave the bracket as it was when citeproc cannot process it.
    console.warn(error);
    return match;
  }
  
  endnoteIndex += 1;
  // return `<a href="#endnote-${endnoteIndex}">[${endnoteIndex}]</a>`;
  // Uses the text of the first entry in result[1] as the link label.
  return `<a href="#endnote-${endnoteIndex}">${result[1][0][1]}</a>`;
}

/**
 * Rewrites every bracketed citation group in one paragraph element by running
 * its markup through replaceAll with the citation regex and the replacer callback.
 *
 * @param {{innerHTML: string}} paragraph - element whose markup is rewritten in place
 */
function findReferences(paragraph) {
  const originalMarkup = paragraph.innerHTML;
  const rewrittenMarkup = originalMarkup.replaceAll(regexpGroup, replacer);
  paragraph.innerHTML = rewrittenMarkup;
}

// Run the citation replacement over every <p> element of the document.
const paragraphs = document.getElementsByTagName('p');
Array.prototype.forEach.call(paragraphs, findReferences);

// Ask the engine for the bibliography and write it into the #bibliography element:
// opening wrapper, the entries separated by newlines, closing wrapper.
let bibResult = citeproc.makeBibliography();
console.log("bibresult", bibResult);
const bibliographyMarkup = bibResult[0].bibstart + bibResult[1].join('\n') + bibResult[0].bibend;
document.getElementById('bibliography').innerHTML = bibliographyMarkup;

The output bib has always only the last item. What's missing here? Another issue I face is that URLs are plain-text instead of clickable.

Original Q&A

There are 2 answers

LarsW On 29 May 2023 at 22:43

With citationsPre permanently set to [], citeproc-js assumes you are updating the first citation cluster, and removes the publication that was previously there from the internal registry. To fix this, you could (as a start) replace the following code:

citationsPre.push(...[]);

with:

citationsPre.push([
  // result[1] are the citation clusters that have changed,
  // result[1][0] is the first cluster.
  result[1][0][2], // cluster id
  result[1][0][0], // note number
]);

However, that does not fix the code entirely.

Citations that are introduced later may change citation clusters that have been rendered before. This might not apply to endnote styles, but might still affect your code. Namely, result[1] contains all citation clusters that possibly changed. To get the "correct" (new) rendered citation you therefore probably cannot use result[1][0][1], you need to track the citation cluster IDs (result[1][*][2]) in conjunction with citationsPre.
The links will probably not work as expected if a single cluster contains multiple references. You seem to only include one citation per cluster though.
citeproc-js always expects the en-US locale to be available, as it is the final step in locale fallback in CSL.

**rriemann** · Accepted Answer · 2023-06-04T11:02:33+00:00

I managed to get it working. For the record here my solution.

A few comments ahead:

I made this work with the Oscola CSL style. It creates citations meant to be footnotes and does not assume a bibliography in the document. As HTML has no real footnotes, I opted for endnotes. Those endnotes refer to each other (e.g. "{short title} (n. 12)" means the long reference is in endnote 12).
The array endnoteArray stores all received notes and I integrate updates coming in from citeproc.processCitationCluster.
citeproc.processCitationCluster calls a method to add to my citation object the property citationID.
In the citationsPre array, I need to number the entries from 1 upwards, because that is the endnote number that Oscola uses, and Oscola assumes that numbering starts at 1.
I replace every finding of a pandoc-formatted citation not with the citation, but with my own counter endnoteIndex, which is like footnote/endnote number. Pandoc+biblatex would do this step automatically when configured for footnote citations (\autocite{ref} with autocite=footnote).

Thanks to LarsW, who pointed me in the right direction.

// Running endnote counter; incremented once for every citation group that was replaced.
let endnoteIndex = 0;
// Rendered note texts, filled in by the replacement callback.
let endnoteArray = [];
// Matches one bracketed pandoc citation group such as "[see @key, p. 3; @other]".
// The first capture group is the content between the brackets.
const regexpGroup = /\[([^\]\n]*@[^\]\n]+)\]/g;
// Splits a single reference into the text before "@" (prefix), the id up to the
// first comma, and the text after that comma (suffix).
const regexpRef = /(?<prefix>[^@]*)@(?<id>[^,]+)(,(?<suffix>.*))?/;

// The CSL style, both locales and the CSL-JSON items are read from elements embedded in the page.
const style = document.getElementById('citeproc-csl-style').textContent;
const locale = document.getElementById('citeproc-locale').textContent;
const locale_fallback = document.getElementById('citeproc-locale-fallback').textContent;
const bibliographyArray = JSON.parse(document.getElementById('citeproc-csl-bib').textContent);
// Index the items by id so that retrieveItem can look them up directly.
const bibliography = {};
// `const item` keeps the loop variable local; an undeclared `item` would become an
// implicit global and throws a ReferenceError in strict mode / ES modules.
for (const item of bibliographyArray) {
  bibliography[item.id] = item;
}

/**
 * System object passed to CSL.Engine. It serves the locales and the bibliography
 * items that were loaded from the page.
 */
const citeprocSys = {
  /**
   * Returns the fallback locale for "en-US" and the document locale for any other request.
   * @param {string} lang - requested locale tag
   * @returns {string} the locale text
   */
  retrieveLocale: function (lang) {
    if (lang === 'en-US') return locale_fallback;
    return locale;
  },
  /**
   * Looks up a bibliography item by its id.
   * @param {string} id - item id as used in the "@id" reference
   * @returns {object} the matching item
   * @throws {Error} when no item with this id exists
   */
  retrieveItem: function (id) {
    // Only own properties count: a plain object lookup would otherwise resolve
    // inherited keys such as "constructor" or "toString" to a function.
    const isKnown = Object.prototype.hasOwnProperty.call(bibliography, id);
    const item = isKnown ? bibliography[id] : undefined;
    if (!item) throw new Error(`Bibliography item ${id} not found.`);
    return item;
  }
};

// Engine instance built from the system object and the CSL style read from the page.
let citeproc = new CSL.Engine(citeprocSys, style);

// citationsPre receives one [citationID, endnote number] pair per processed citation;
// both arrays are passed to every processCitationCluster call.
let citationsPre = [];
let citationsPost = [];

// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/replace#specifying_a_function_as_the_replacement
/**
 * Replacement callback used with replaceAll(regexpGroup, replacer).
 * Parses the references inside one bracket, hands them to citeproc as a single
 * citation, stores the rendered notes in endnoteArray and returns a numbered
 * endnote marker that links to the note.
 *
 * @param {string} match - the complete bracket; returned unchanged when a
 *   reference cannot be parsed or citeproc throws
 * @param {string} refsString - first capture group: the text between the brackets
 * @returns {string} replacement markup for the bracket
 */
function replacer(match, refsString) {
  const citationItems = [];
  // One cite item per ";"-separated reference; "&nbsp;" entities become plain spaces first.
  for (const refString of refsString.replaceAll('&nbsp;', ' ').split(';')) {
    const refMatch = refString.trim().match(regexpRef);
    if (!refMatch) {
      // A segment without "@id" (e.g. "[see @a; more text]") cannot be turned into a
      // cite item. Keep the bracket as written instead of throwing out of replaceAll.
      console.warn(`Cannot parse citation reference "${refString.trim()}" in ${match}`);
      return match;
    }
    const refData = refMatch.groups;
    // https://citeproc-js.readthedocs.io/en/latest/csl-json/markup.html#cite-items
    citationItems.push({
      id: refData.id,
      // Locators (https://docs.citationstyles.org/en/stable/specification.html#locators)
      // are not parsed; the text after the first comma is passed on as a plain suffix.
      prefix: refData.prefix || undefined,
      suffix: refData.suffix,
    });
  }

  const citation = {
    citationItems,
    properties: {
      // In the properties portion of a citation, the noteIndex value indicates the footnote number in which the citation is located within the document.
      // Citations within the main text of the document have a noteIndex of zero.
      noteIndex: 0,
    },
  };

  let result;
  try {
    result = citeproc.processCitationCluster(citation, citationsPre, citationsPost);
    // Record this citation for the following calls: the citationID found on the
    // citation object after processing, and its endnote number starting from 1.
    citationsPre.push([citation.citationID, endnoteIndex + 1]);
  } catch (error) {
    console.warn(error);
    return match;
  }

  // result[1] may contain updates for notes rendered earlier; each entry is stored
  // at the position given by its first element, with its second element as the text.
  for (const entry of result[1]) {
    endnoteArray[entry[0]] = entry[1];
  }
  endnoteIndex += 1;
  // Citations are numbered and linked as endnotes instead of being printed inline.
  return `<sup><a href="#endnote-${endnoteIndex}" id="ref-${endnoteIndex}">(${endnoteIndex})</a></sup>`;
}

/**
 * Rewrites every bracketed citation group in one paragraph element by running
 * its markup through replaceAll with the citation regex and the replacer callback.
 *
 * @param {{innerHTML: string}} paragraph - element whose markup is rewritten in place
 */
function findReferences(paragraph) {
  const originalMarkup = paragraph.innerHTML;
  const rewrittenMarkup = originalMarkup.replaceAll(regexpGroup, replacer);
  paragraph.innerHTML = rewrittenMarkup;
}

/**
 * Builds the markup of one endnote entry from a rendered note.
 * Text wrapped in the entities &#60; … &#62; becomes a link opening in a new tab,
 * "(n <number>)" cross references become links to that endnote, and a back link
 * to the marker in the text is appended.
 *
 * @param {string} noteHtml - rendered note text
 * @param {number} position - zero-based position of the note in endnoteArray
 * @returns {string} the endnote <div>
 */
function renderEndnote(noteHtml, position) {
  const number = position + 1;
  const withLinkedUrls = noteHtml.replaceAll(
    /&#60;(.*?)&#62;/mg,
    (match, url) => `<a href="${url}" target="_blank">${url}</a>`
  );
  const withLinkedNotes = withLinkedUrls.replaceAll(
    /\((n (\d+))\)/mg,
    (match, note, ref) => `(<a href="#endnote-${ref}">${note}</a>)`
  );
  return `<div id="endnote-${number}" class="csl-entry">${withLinkedNotes} <a href="#ref-${number}">↩︎</a></div>`;
}

// Run the citation replacement over every <p> element of the document.
const paragraphs = document.getElementsByTagName('p');
Array.prototype.forEach.call(paragraphs, findReferences);

let bibResult = citeproc.makeBibliography();
console.log("bibresult", bibResult);
// The collected notes are written out as endnotes between the bibliography wrappers.
// (To print the regular bibliography instead, join bibResult[1] in place of the endnotes.)
const endnotesMarkup = endnoteArray.map((note, position) => renderEndnote(note, position)).join('\n');
document.getElementById('bibliography').innerHTML = bibResult[0].bibstart + endnotesMarkup + bibResult[0].bibend;

TechQA.

Replace pandoc Markdown citeproc references in html DOM by citations and add bibliography with citeproc-js

There are 2 answers

Related Questions in CSL

Related Questions in CITATION-JS

Related Questions in CITEPROC-JS

Popular Questions

Popular Tags

Trending Questions