JS Regex for Parsing SGF (Meta)data

110 views Asked by At

SGF is what's widely used to save Go (board game) games as text. One example of how it saves its data is this — as a whole, SGF is a text-based representation of a trie, which means it is recursive, but this question is regarding only the data in each node —:

GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25]

Data is saved as pairs of strings plus data within brackets ([]). The keys are usually of length 2, but not necessarily so; moves are encoded as B[aa] or W[bb], where B denotes Black, and W is White, while aa and bb are the coordinates on the board.

My question is: what would then be the regex to extract these data groups as key-value objects? (I'm using JavaScript or TypeScript here.)


References

1

There are 1 answers

13
InSync On BEST ANSWER

1. If only trying to match the (meta)data

Assuming only the simplest rules, the regex that you (might be) looking for is:

/(?<key>[A-Z]+)\[(?<value>(?:\\\]|[^\]])*)\]/gy

...where the g flag stands for "global" (matches all instances) and y for "sticky" (each match either starts at the very first character or right after the end of the last match). If you are 120% confident that the content is valid, feel free to drop the y.

Try it:

console.config({ maximize: true });

const properties = /(?<key>[A-Z]+)\[(?<value>(?:\\\]|[^\]])*)\]/g;

const testcases = [
  '(;GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25];B[pd];W[dd];B[pq];W[dp])',
  '(;GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25];B[pd];W[dd](;B[pq];W[dp])(;B[dp];W[pp]))',
  '(;GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25];B[pd];W[dd](;B[pq];W[dp])(;B[dp];W[pp];PL[B]AE[jk]AB[jj]AW[ji];B[jq]))',
  '(;GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25];B[pd]C[Comment on move.];W[dd](;B[pq];W[dp])(;B[dp];W[pp];PL[B]AE[jk]AB[jj]AW[ji]C[Comment on editing.];B[jq]))'
];

for (const testcase of testcases) {
  console.log(
    [...testcase.matchAll(properties)].map(
      ({ groups: { key, value } }) => ({ key, value })
    )
  );
}
<script src="https://gh-canon.github.io/stack-snippet-console/console.min.js"></script>

2. On the Full Grammar

According to the second link, this is the full SGF grammar:

Collection     = { GameTree }
GameTree       = "(" RootNode NodeSequence { Tail } ")"
Tail           = "(" NodeSequence { Tail } ")"
NodeSequence   = { Node }
RootNode       = Node
Node           = ";" { Property }
Property       = PropIdent PropValue { PropValue }
PropIdent      = UcLetter { UcLetter }
PropValue      = "[" Value "]"
UcLetter       = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
                 "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
                 "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"

Rule Tail is recursive, which means it is impossible to rewrite a rule depending on it (Collection, GameTree and itself) as an ECMAScript-compliant regex.

PCRE, on the other hand, has support for recursive pattern:

(?(DEFINE)
  (?<Collection>
    (?:(?&GameTree)(?:\s*(?&GameTree))*)?
  )
  (?<GameTree>
    \(\s*
    (?&RootNode)\s*
    (?&NodeSequence)\s*
    (?:(?&Tail)(?:\s*(?&Tail))*)?\s*
    \)
  )
  (?<Tail>
    \(\s*
    (?&NodeSequence)\s*
    (?:(?&Tail)(?:\s*(?&Tail))*)?\s*
    \)
  )
  (?<NodeSequence>
    (?:(?&Node)(?:\s*(?&Node))*)?
  )
  (?<RootNode> (?&Node) )
  (?<Node> ;\s*(?:(?&Property)(?:\s*(?&Property))*)? )
  (?<Property>
    (?&PropIdent)\s*
    (?&PropValue)(?:\s*(?&PropValue))*
  )
  (?<PropIdent> [A-Z](?:\s*[A-Z])* )
  (?<PropValue> \[ (?&Value) \])
  (?<Value> (?:\\\]|[^\]])* )
)

Try it on regex101.com.


Assuming Tail cannot contain itself (i.e. Tail = "(" NodeSequence ")"), the GameTree rule can be rewritten as (~1100 characters):

/\(\s*;(?:[A-Z](?:\s*[A-Z])*\s*\[(?:\\\]|[^\]])*\](?:\s*\[(?:\\\]|[^\]])*\])*(?:\s*[A-Z](?:\s*[A-Z])*\s*\[(?:\\\]|[^\]])*\](?:\s*\[(?:\\\]|[^\]])*\])*)*)?\s*(?:;(?:[A-Z](?:\s*[A-Z])*\s*\[(?:\\\]|[^\]])*\](?:\s*\[(?:\\\]|[^\]])*\])*(?:\s*[A-Z](?:\s*[A-Z])*\s*\[(?:\\\]|[^\]])*\](?:\s*\[(?:\\\]|[^\]])*\])*)*)?(?:\s*;(?:[A-Z](?:\s*[A-Z])*\s*\[(?:\\\]|[^\]])*\](?:\s*\[(?:\\\]|[^\]])*\])*(?:\s*[A-Z](?:\s*[A-Z])*\s*\[(?:\\\]|[^\]])*\](?:\s*\[(?:\\\]|[^\]])*\])*)*)?)*)?\s*(?:\((?:;(?:[A-Z](?:\s*[A-Z])*\s*\[(?:\\\]|[^\]])*\](?:\s*\[(?:\\\]|[^\]])*\])*(?:\s*[A-Z](?:\s*[A-Z])*\s*\[(?:\\\]|[^\]])*\](?:\s*\[(?:\\\]|[^\]])*\])*)*)?(?:\s*;(?:[A-Z](?:\s*[A-Z])*\s*\[(?:\\\]|[^\]])*\](?:\s*\[(?:\\\]|[^\]])*\])*(?:\s*[A-Z](?:\s*[A-Z])*\s*\[(?:\\\]|[^\]])*\](?:\s*\[(?:\\\]|[^\]])*\])*)*)?)*)?\)(?:\((?:;(?:[A-Z](?:\s*[A-Z])*\s*\[(?:\\\]|[^\]])*\](?:\s*\[(?:\\\]|[^\]])*\])*(?:\s*[A-Z](?:\s*[A-Z])*\s*\[(?:\\\]|[^\]])*\](?:\s*\[(?:\\\]|[^\]])*\])*)*)?(?:\s*;(?:[A-Z](?:\s*[A-Z])*\s*\[(?:\\\]|[^\]])*\](?:\s*\[(?:\\\]|[^\]])*\])*(?:\s*[A-Z](?:\s*[A-Z])*\s*\[(?:\\\]|[^\]])*\](?:\s*\[(?:\\\]|[^\]])*\])*)*)?)*)?\))*)?\s*\)/

Try it on regex101.com.

This only ever validates the content, giving you no groups. However, even if groups were added (e.g. \(\s*(?<rootNode>...)(?<nodeSequence>...)(?<tails>...)\s*\)), they would have to be re-parsed using the other rules.

That said, assuming non-recursive structure, a parser using pure regex would repeat itself a lot.

Some people, when confronted with a problem, think "I know, I'll use regular expressions." Now they have two problems.

— Jamie Zawinski (Coding Horror blog post)

Code used to generate the monster above:

const rules = (() => {
  const value = String.raw`(?:\\\]|[^\]])*`;
  const propIdent = String.raw`[A-Z](?:\s*[A-Z])*`;
  const propValue = String.raw`\[${value}\]`;
  const property = String.raw`
    ${propIdent}\s*
    ${propValue}(?:\s*${propValue})*
  `;
  
  const node = String.raw`;(?:${property}(?:\s*${property})*)?`;
  const rootNode = node;
  const nodeSequence = String.raw`(?:${node}(?:\s*${node})*)?`;
  const tail = String.raw`\(${nodeSequence}\)`;
  
  const gameTree = String.raw`
    \(\s*
    ${rootNode}\s*
    ${nodeSequence}\s*
    (?:${tail}(?:${tail})*)?\s*
    \)
  `;
  const collection = String.raw`(?:${gameTree}(?:\s*${gameTree})*)?`;
  
  const rules = {
    collection, gameTree,
    tail, nodeSequence, rootNode, node,
    property, propIdent, propValue, value
  };

  for (const [key, value] of Object.entries(rules)) {
    rules[key] = new RegExp(value.replace(/\s+/g, ''));
  }
  
  return rules;
})();

Try it:

console.config({ maximize: true });

const rules = (() => {
  const value = String.raw`(?:\\\]|[^\]])*`;
  const propIdent = String.raw`[A-Z](?:\s*[A-Z])*`;
  const propValue = String.raw`\[${value}\]`;
  const property = String.raw`
    ${propIdent}\s*
    ${propValue}(?:\s*${propValue})*
  `;
  
  const node = String.raw`;(?:${property}(?:\s*${property})*)?`;
  const rootNode = node;
  const nodeSequence = String.raw`(?:${node}(?:\s*${node})*)?`;
  const tail = String.raw`\(${nodeSequence}\)`;
  
  const gameTree = String.raw`
    \(\s*
    ${rootNode}\s*
    ${nodeSequence}\s*
    (?:${tail}(?:${tail})*)?\s*
    \)
  `;
  const collection = String.raw`(?:${gameTree}(?:\s*${gameTree})*)?`;
  
  const rules = {
    collection, gameTree,
    tail, nodeSequence, rootNode, node,
    property, propIdent, propValue, value
  };

  for (const [key, value] of Object.entries(rules)) {
    rules[key] = new RegExp(value.replace(/\s+/g, ''));
  }
  
  return rules;
})();

console.log(rules);

const testcases = [
  '(;GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25];B[pd];W[dd];B[pq];W[dp])',
  '(;GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25];B[pd];W[dd](;B[pq];W[dp])(;B[dp];W[pp]))',
  '(;GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25];B[pd];W[dd](;B[pq];W[dp])(;B[dp];W[pp];PL[B]AE[jk]AB[jj]AW[ji];B[jq]))',
  '(;GM[1]FF[4]CA[UTF-8]AP[Sabaki:0.52.2]KM[6.5]SZ[19]DT[2023-12-25];B[pd]C[Comment on move.];W[dd](;B[pq];W[dp])(;B[dp];W[pp];PL[B]AE[jk]AB[jj]AW[ji]C[Comment on editing.];B[jq]))'
];

for (const testcase of testcases) {
  console.log(testcase.match(rules.gameTree));
}
<script src="https://gh-canon.github.io/stack-snippet-console/console.min.js"></script>