QScintilla syntax highlighting with QsciLexerCustom - UTF-8 issue with german characters

129 views Asked by At

Much like this related question, I found myself using QScintilla to create a syntax highlighter that has to deal with non-ASCII characters (é, ä, ß, etc...). I use the trick described in the comments of that question to solve the problem, styling the characters base on the length of the utf-8 bytes rather than the Latin-1 bytes. When styling the entire document, it works fine.

However, my issue arises when using the start/end parameters to only style part of the document as there seems to be a mismatch between the start/end parameters and the actual length of the text being styled. I need to use this as I am dealing with large files that cause a 1-2 second input delay if I continuously style the entire document.

I have the following very simple example:

é

; Comment

When I open the file, which runs the highlighter from start to finish, it looks like that: Right colours

However, if I remove and re-type the comment, the colouring will always be one letter off. Wrong colours

This effect stacks indefinitely, with every non-ASCII character, the colouring goes off by another letter until it is a mess.

I have provided a minimal reproducible example below. All you have to do to notice the problem is start a comment below the last line, you will notice that your styling is always one off.

more wrong colours

import sys
from PyQt6.QtWidgets import *
from PyQt6.QtCore import *
from PyQt6.QtGui import *
from PyQt6.Qsci import *
import re

class MyLexer(QsciLexerCustom):
    def __init__(self, parent):
        super(MyLexer, self).__init__(parent)
        self.setDefaultColor(QColor("#ff000000"))
        self.setDefaultPaper(QColor("#ffffffff"))
        self.setDefaultFont(QFont("Consolas", 14))

        self.setColor(QColor("#ff000000"), 0)   # Style 0: black
        self.setColor(QColor("#ff007f00"), 3)   # Style 3: green

        self.setPaper(QColor("#ffffffff"), 0)   # Style 0: white
        self.setPaper(QColor("#ffffffff"), 3)   # Style 3: white

        self.setFont(QFont("Consolas", 14, weight=QFont.Weight.Bold), 0)   # Style 0: Consolas 14pt
        self.setFont(QFont("Consolas", 14, weight=QFont.Weight.Bold), 3)   # Style 3: Consolas 14pt

    def language(self):
        return "SimpleLanguage"

    def description(self, style):
        return str(style)

    def styleText(self, start, end):
        self.startStyling(start)
        text = self.parent().text()[start:end]
        p = re.compile(r"[*]\/|\/[*]|\s+|\w+|\W")

        token_list = [ (token, len(bytearray(token, "utf-8"))) for token in p.findall(text)]

        editor = self.parent()
        apply_until_linebreak = None
        if start > 0:
            previous_style_nr = editor.SendScintilla(editor.SCI_GETSTYLEAT, start - 1)
            if previous_style_nr in [2, 3]:
                apply_until_linebreak = previous_style_nr

        for i, token in enumerate(token_list):
            if apply_until_linebreak is not None:
                if "\n" in token[0]:
                    apply_until_linebreak = None
                    self.setStyling(token[1], 0)
                else:
                    self.setStyling(token[1], apply_until_linebreak)
            else:
                if token[0] in ["/", ";"]:
                    apply_until_linebreak = 3
                    self.setStyling(token[1], 3)
                else:
                    self.setStyling(token[1], 0)

myCodeSample = r"""
This is white

// This is a green comment

This is white again

// This comment has a (ä) special character 
"""

class CustomMainWindow(QMainWindow):
    def __init__(self):
        super(CustomMainWindow, self).__init__()
        self.setGeometry(300, 300, 800, 400)
        self.setWindowTitle("QScintilla Test")

        self.__frm = QFrame(self)
        self.__frm.setStyleSheet("QWidget { background-color: #ffeaeaea }")
        self.__lyt = QVBoxLayout()
        self.__frm.setLayout(self.__lyt)
        self.setCentralWidget(self.__frm)
        self.__myFont = QFont()
        self.__myFont.setPointSize(14)

        self.__editor = QsciScintilla()
        self.__editor.setText(myCodeSample) 
        self.__editor.setLexer(None)            # We install lexer later
        self.__editor.setUtf8(True)             # Set encoding to UTF-8
        self.__editor.setFont(self.__myFont)    # Gets overridden by lexer later on

        self.__lexer = MyLexer(self.__editor)
        self.__editor.setLexer(self.__lexer)

        self.__lyt.addWidget(self.__editor)
        self.show()

if __name__ == '__main__':
    app = QApplication(sys.argv)
    QApplication.setStyle(QStyleFactory.create('Fusion'))
    myGUI = CustomMainWindow()
    sys.exit(app.exec())
0

There are 0 answers