I do match of words in a text to retrieve the word offset begin and end. This normally works for both ascii and unicode texts when using an appropriate unicode-aware regex like '(?<=^|\\PL)$1(?=\\PL|$)'
. When I have mixed text (like Korean and English here) there are some problems, while tokenizing:
function aggressive_tokenizer(text) {
// most punctuation
text = text.replace(/([^\w\.\'\-\/\+\<\>,&])/g, " $1 ");
// commas if followed by space
text = text.replace(/(,\s)/g, " $1");
// single quotes if followed by a space
text = text.replace(/('\s)/g, " $1");
// single quotes if last char
text = text.replace(/('$)/, " $1");
text = text.replace(/(\s+[`'"‘])(\w+)\b(?!\2)/g, " $2")
// periods before newline or end of string
text = text.replace(/\. *(\n|$)/g, " . ");
// replace punct
// ignore "-" since may be in slang scream
text = text.replace(/[\\?\^%<>=!&|+\~]/g, "");
text = text.replace(/[…;,.:*#\)\({}\[\]]/g, "");
// finally split remainings into words
text = text.split(/\s+/)
return text;
}
var text = "점점 더 깊이 끌려가"
var tokens = aggressive_tokenizer(text);
var seen = new Map();
var indexes = tokens.map(token => { // for each token
let item = {
"word": token
}
var pattern = '(?<!\\pL\\pM*)$1(?!\\pL)';
var escaped = token.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
var wordRegex = new XRegExp(pattern.replace('$1', escaped), "g");
// calculate token begin end
var match = null;
while ((match = wordRegex.exec(text)) !== null) {
if (match.index > (seen.get(token) || -1)) {
var wordStart = match.index;
var wordEnd = wordStart + token.length - 1;
item.characterOffsetBegin = wordStart;
item.characterOffsetEnd = wordEnd;
seen.set(token, wordEnd);
break;
}
}
return item;
});
indexes.forEach(index => {
if (index.word != text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1)) {
console.log("NOT MATCHING!!! " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
} else {
console.log("\tMATCHED " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
}
});
<script src="https://unpkg.com/xregexp/xregexp-all.js"></script>
The problem is that I do some cleanup in tokenization like
text = text.replace(/([^\w\.\'\-\/\+\<\>,&])/g, " $1 ");
where \w
is not unicode-aware, but if I replace it with \p{Alnum}
:
text = text.replace(/([^\p{Alnum}\.\'\-\/\+\<\>,&])/g, " $1 ");
that it should be the equivalent for Unicode word, it does not work properly.
NOTE Please note that I do use XRegExp to support Unicode regex in JavaScript.
UPDATE
According to the comments below, I have updated the code with the modified pattern regexp '(?<=^|\\PL)$1(?=\\PL|$)'
by – Wiktor Stribiżew and replaced XRegExp with built-in RegExp, due to missing support for varied width lookbehind patterns (see comments).
This solution works better, but I have identified an additional case where the char offset begin and end cannot be matched for the given input text: "점점 더 깊이 끌려가"
the output will have a missing offset / match for
{
"index": 2,
"word": "점"
}
function aggressive_tokenizer(text) {
// most punctuation
text = text.replace(/[^\w\.\-\/\+\<\>,&]/g, " $& ");
// commas if followed by space
text = text.replace(/(,\s)/g, " $1");
// single quotes if followed by a space
text = text.replace(/('\s)/g, " $1");
// single quotes if last char
text = text.replace(/('$)/, " $1");
text = text.replace(/(\s+[`'"‘])(\w+)\b(?!\2)/g, " $2")
// periods before newline or end of string
text = text.replace(/\. *(\n|$)/g, " . ");
// replace punct
// ignore "-" since may be in slang scream
text = text.replace(/[\\?\^%<>=!&|+\~]/g, "");
text = text.replace(/[…;,.:*#\)\({}\[\]]/g, "");
// finally split remainings into words
text = text.split(/\s+/)
return text;
}
var text = "점점 더 깊이 끌려가"
var tokens = aggressive_tokenizer(text);
var seen = new Map();
var indexes = tokens.map(token => { // for each token
let item = {
"word": token
}
var pattern = '(?<!\\pL\\pM*)$1(?!\\pL)';
var escaped = token.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
var wordRegex = new RegExp(pattern.replace('$1', escaped), "g");
// calculate token begin end
var match = null;
while ((match = wordRegex.exec(text)) !== null) {
if (match.index > (seen.get(token) || -1)) {
var wordStart = match.index;
var wordEnd = wordStart + token.length - 1;
item.characterOffsetBegin = wordStart;
item.characterOffsetEnd = wordEnd;
seen.set(token, wordEnd);
break;
}
}
return item;
});
indexes.forEach(index => {
if (!index.characterOffsetBegin && !index.characterOffsetEnd) {
console.log("MISSING INDEXES " + index.word);
} else if (index.word != text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1)) {
console.log("NOT MATCHING!!! " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
} else {
console.log("\tMATCHED " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
}
});