Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Unicode Normalization to speech and braille #16521

Merged
Merged
32 changes: 28 additions & 4 deletions source/braille.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
import brailleViewer
from autoSettingsUtils.driverSetting import BooleanDriverSetting, NumericDriverSetting
from utils.security import objectBelowLockScreenAndWindowsIsLocked
from textUtils import isUnicodeNormalized, UnicodeNormalizationOffsetConverter
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
import hwIo
from editableText import EditableText

Expand Down Expand Up @@ -496,13 +497,36 @@ def update(self):
mode = louis.dotsIO
if config.conf["braille"]["expandAtCursor"] and self.cursorPos is not None:
mode |= louis.compbrlAtCursor
self.brailleCells, self.brailleToRawPos, self.rawToBraillePos, self.brailleCursorPos = louisHelper.translate(
converter: UnicodeNormalizationOffsetConverter | None = None
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
if config.conf["braille"]["unicodeNormalization"] and not isUnicodeNormalized(self.rawText):
converter = UnicodeNormalizationOffsetConverter(self.rawText)
textToTranslate = converter.encoded
# Typeforms must be adapted to represent normalized characters.
textToTranslateTypeforms = [
self.rawTextTypeforms[strOffset] for strOffset in converter.computedEncodedToStrOffsets
]
# Convert the cursor position to a normalized offset.
cursorPos = converter.strToEncodedOffsets(self.cursorPos)
else:
textToTranslate = self.rawText
textToTranslateTypeforms = self.rawTextTypeforms
cursorPos = self.cursorPos
self.brailleCells, brailleToRawPos, rawToBraillePos, self.brailleCursorPos = louisHelper.translate(
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
[handler.table.fileName, "braille-patterns.cti"],
self.rawText,
typeform=self.rawTextTypeforms,
textToTranslate,
typeform=textToTranslateTypeforms,
mode=mode,
cursorPos=self.cursorPos
cursorPos=cursorPos
)
if converter:
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
# The received brailleToRawPos contains braille to normalized positions.
# Process them to represent real raw positions by converting them from normalized ones.
brailleToRawPos = [converter.encodedToStrOffsets(i) for i in brailleToRawPos]
# The received rawToBraillePos contains normalized to braille positions.
# Create a new list based on real raw positions.
rawToBraillePos = [rawToBraillePos[i] for i in converter.computedStrToEncodedOffsets]
self.brailleToRawPos = brailleToRawPos
self.rawToBraillePos = rawToBraillePos
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
if (
self.selectionStart is not None
and self.selectionEnd is not None
Expand Down
2 changes: 2 additions & 0 deletions source/config/configSpec.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
# symbolLevel: One of the characterProcessing.SymbolLevel values.
symbolLevel = integer(default=100)
trustVoiceLanguage = boolean(default=true)
unicodeNormalization = featureFlag(optionsEnum="BoolFlag", behaviorOfDefault="disabled")
includeCLDR = boolean(default=True)
beepSpeechModePitch = integer(default=10000,min=50,max=11025)
outputDevice = string(default=default)
Expand Down Expand Up @@ -82,6 +83,7 @@
optionsEnum="ReviewRoutingMovesSystemCaretFlag", behaviorOfDefault="NEVER")
readByParagraph = boolean(default=false)
wordWrap = boolean(default=true)
unicodeNormalization = featureFlag(optionsEnum="BoolFlag", behaviorOfDefault="disabled")
focusContextPresentation = option("changedContext", "fill", "scroll", default="changedContext")
interruptSpeechWhileScrolling = featureFlag(optionsEnum="BoolFlag", behaviorOfDefault="enabled")
showSelection = featureFlag(optionsEnum="BoolFlag", behaviorOfDefault="enabled")
Expand Down
24 changes: 24 additions & 0 deletions source/gui/settingsDialogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1589,6 +1589,17 @@ def makeSettings(self, settingsSizer):
self.bindHelpEvent("SpeechSettingsTrust", self.trustVoiceLanguageCheckbox)
self.trustVoiceLanguageCheckbox.SetValue(config.conf["speech"]["trustVoiceLanguage"])

self.unicodeNormalizationCombo: nvdaControls.FeatureFlagCombo = settingsSizerHelper.addLabeledControl(
labelText=_(
# Translators: This is a label for a combo-box in the Braille settings panel.
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
"Unicode normali&zation"
),
wxCtrlClass=nvdaControls.FeatureFlagCombo,
keyPath=["speech", "unicodeNormalization"],
conf=config.conf,
)
self.bindHelpEvent("SpeechUnicodeNormalization", self.unicodeNormalizationCombo)

seanbudd marked this conversation as resolved.
Show resolved Hide resolved
includeCLDRText = _(
# Translators: This is the label for a checkbox in the
# voice settings panel (if checked, data from the unicode CLDR will be used
Expand Down Expand Up @@ -1701,6 +1712,7 @@ def onSave(self):
self.symbolLevelList.GetSelection()
].value
config.conf["speech"]["trustVoiceLanguage"] = self.trustVoiceLanguageCheckbox.IsChecked()
self.unicodeNormalizationCombo.saveCurrentValueToConf()
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
currentIncludeCLDR = config.conf["speech"]["includeCLDR"]
config.conf["speech"]["includeCLDR"] = newIncludeCldr = self.includeCLDRCheckbox.IsChecked()
if currentIncludeCLDR is not newIncludeCldr:
Expand Down Expand Up @@ -4145,6 +4157,17 @@ def makeSettings(self, settingsSizer):
self.bindHelpEvent("BrailleSettingsWordWrap", self.wordWrapCheckBox)
self.wordWrapCheckBox.Value = config.conf["braille"]["wordWrap"]

self.unicodeNormalizationCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl(
labelText=_(
# Translators: This is a label for a combo-box in the Braille settings panel.
"Unicode normali&zation"
),
wxCtrlClass=nvdaControls.FeatureFlagCombo,
keyPath=["braille", "unicodeNormalization"],
conf=config.conf,
)
self.bindHelpEvent("BrailleUnicodeNormalization", self.unicodeNormalizationCombo)

seanbudd marked this conversation as resolved.
Show resolved Hide resolved
self.brailleInterruptSpeechCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl(
labelText=_(
# Translators: This is a label for a combo-box in the Braille settings panel.
Expand Down Expand Up @@ -4184,6 +4207,7 @@ def onSave(self):
self.brailleReviewRoutingMovesSystemCaretCombo.saveCurrentValueToConf()
config.conf["braille"]["readByParagraph"] = self.readByParagraphCheckBox.Value
config.conf["braille"]["wordWrap"] = self.wordWrapCheckBox.Value
self.unicodeNormalizationCombo.saveCurrentValueToConf()
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
config.conf["braille"]["focusContextPresentation"] = self.focusContextPresentationValues[self.focusContextPresentationList.GetSelection()]
self.brailleInterruptSpeechCombo.saveCurrentValueToConf()
self.brailleShowSelectionCombo.saveCurrentValueToConf()
Expand Down
11 changes: 9 additions & 2 deletions source/speech/speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import speechDictHandler
import characterProcessing
import languageHandler
from textUtils import unicodeNormalize
from . import manager
from .extensions import speechCanceled, pre_speechCanceled, pre_speech
from .extensions import filter_speechSequence, speechCanceled
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -1568,6 +1569,8 @@ def getTextInfoSpeech( # noqa: C901
# There was content after the indentation, so there is no more indentation.
indentationDone=True
if command:
if config.conf["speech"]["unicodeNormalization"]:
command = unicodeNormalize(command)
if inTextChunk:
relativeSpeechSequence[-1]+=command
else:
Expand Down Expand Up @@ -1775,7 +1778,7 @@ def getPropertiesSpeech( # noqa: C901
reason: OutputReason = OutputReason.QUERY,
**propertyValues
) -> SpeechSequence:
textList: List[str] = []
textList: SpeechSequence = []
name: Optional[str] = propertyValues.get('name')
if name:
textList.append(name)
Expand Down Expand Up @@ -1968,7 +1971,11 @@ def getPropertiesSpeech( # noqa: C901
errorMessage: str | None = propertyValues.get("errorMessage", None)
if errorMessage:
textList.append(errorMessage)

if config.conf["speech"]["unicodeNormalization"]:
textList = [
unicodeNormalize(t) if isinstance(t, str) else t
for t in textList
]
types.logBadSequenceTypes(textList)
return textList

Expand Down
175 changes: 163 additions & 12 deletions source/textUtils.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,32 @@
# -*- coding: UTF-8 -*-
# A part of NonVisual Desktop Access (NVDA)
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.
# Copyright (C) 2018-2021 NV Access Limited, Babbage B.V., Łukasz Golonka
# Copyright (C) 2018-2024 NV Access Limited, Babbage B.V., Łukasz Golonka

"""
Classes and utilities to deal with offsets in variable width encodings, particularly utf_16.
"""

import encodings
import sys
import ctypes
from collections.abc import ByteString
from typing import Tuple, Optional, Type
import encodings
import locale
import unicodedata
from abc import ABCMeta, abstractmethod, abstractproperty
from collections import defaultdict
from difflib import ndiff
from functools import cached_property
from typing import Optional, Tuple, Type

from logHandler import log
from abc import abstractmethod

WCHAR_ENCODING = "utf_16_le"
UTF8_ENCODING = "utf-8"
USER_ANSI_CODE_PAGE = locale.getpreferredencoding()


class OffsetConverter:
class OffsetConverter(metaclass=ABCMeta):
decoded: str

def __init__(self, text: str):
if not isinstance(text, str):
raise TypeError("Value must be of type str")
Expand All @@ -33,7 +35,7 @@ def __init__(self, text: str):
def __repr__(self):
return f"{self.__class__.__name__}({repr(self.decoded)})"

@property
@abstractproperty
def encodedStringLength(self) -> int:
"""Returns the length of the string in its subclass-specific encoded representation."""
raise NotImplementedError
Expand Down Expand Up @@ -385,8 +387,6 @@ class IdentityOffsetConverter(OffsetConverter):
This is a dummy converter that assumes 1:1 correspondence between encoded and decoded characters.
"""

_encoding: str = UTF8_ENCODING

def __init__(self, text: str):
super().__init__(text)

Expand Down Expand Up @@ -417,6 +417,157 @@ def encodedToStrOffsets(
return (encodedStart, encodedEnd)


DEFAULT_UNICODE_NORMALIZATION_ALGORITHM = "NFKC"


class UnicodeNormalizationOffsetConverter(OffsetConverter):
	"""
	Object that holds a string in both its decoded and its unicode normalized form.
	The object allows for easy conversion between offsets in strings which may or may not be normalized.

	For example, when using the NFKC algorithm, the "ij" ligature normalizes to "ij",
	which takes two characters instead of one.
	"""
	# The unicode normalization form used (e.g. "NFKC"), as passed to unicodedata.normalize.
	normalizationForm: str
	# Maps each offset in the original (decoded) string to an offset in the normalized (encoded) string.
	computedStrToEncodedOffsets: tuple[int, ...]
	# Maps each offset in the normalized (encoded) string back to an offset in the original string.
	computedEncodedToStrOffsets: tuple[int, ...]

	def __init__(self, text: str, normalizationForm: str = DEFAULT_UNICODE_NORMALIZATION_ALGORITHM):
		# Base class validates that text is a str and stores it as self.decoded.
		super().__init__(text)
		self.normalizationForm = normalizationForm
		# The normalized representation of the input text.
		self.encoded: str = unicodedata.normalize(normalizationForm, text)
		# Precompute both offset mappings once, since conversions may be requested repeatedly.
		self.computedStrToEncodedOffsets, self.computedEncodedToStrOffsets = self._calculateOffsets()

def _calculateOffsets(self) -> tuple[tuple[int], tuple[int]]:
diff = list(ndiff(self.decoded, self.encoded))
diff.append("!") # Closing the diff
iOrigin = iNormalized = 0
originBuffer = ""
normalizedBuffer = ""
originToNormalizedDict = defaultdict(list)
normalizedToOriginDict = defaultdict(list)
originPending = normalizedPending = False
for char in diff:
if char[0] == "?":
raise RuntimeError("Unexpected entry in diff")
elif char[0] == "-":
originBuffer += char[2:]
originPending = True
elif char[0] == "+":
normalizedBuffer += char[2:]
normalizedPending = True
elif char[0] == " " and (
(not originPending and normalizedPending) or (originPending and not normalizedPending)
):
originBuffer += char[2:]
normalizedBuffer += char[2:]
else:
while originBuffer and normalizedBuffer:
originPart = ""
originPartLen = 0
normalizedPart = ""
normalizedPartLen = 0
for i in range(len(originBuffer)):
originPart = originBuffer[: (i + 1)]
normalizedPart = unicodedata.normalize(self.normalizationForm, originPart)
if (
originPart == normalizedPart
or not normalizedBuffer.startswith(normalizedPart)
):
continue
originPartLen = len(originPart)
originBuffer = originBuffer[originPartLen:]
normalizedPartLen = len(normalizedPart)
normalizedBuffer = normalizedBuffer[normalizedPartLen:]
break
originMultiplier = min(originPartLen / normalizedPartLen, 1)
normalizedMultiplier = min(normalizedPartLen / originPartLen, 1)
for i in range(max(originPartLen, normalizedPartLen)):
tempOrigin = iOrigin + int(i * originMultiplier)
tempNormalized = iNormalized + int(i * normalizedMultiplier)
originC = originPart[i] if i < originPartLen else None
if originC:
normalizedIndex = normalizedPart.find(originC)
if normalizedIndex != -1:
tempNormalized = iNormalized + normalizedIndex
normalizedC = normalizedPart[i] if i < normalizedPartLen else None
if normalizedC:
originIndex = originPart.find(normalizedC)
if originIndex != -1:
tempOrigin = iOrigin + originIndex
originToNormalizedDict[tempOrigin].append(tempNormalized)
normalizedToOriginDict[tempNormalized].append(tempOrigin)
iOrigin += originPartLen
iNormalized += normalizedPartLen
originPending = normalizedPending = False
if char[0] == " ":
originToNormalizedDict[iOrigin].append(iNormalized)
normalizedToOriginDict[iNormalized].append(iOrigin)
iOrigin += 1
iNormalized += 1
originResult = tuple(map(min, originToNormalizedDict.values()))
assert len(originResult) == len(self.decoded)
normalizedResult = tuple(map(min, normalizedToOriginDict.values()))
assert len(normalizedResult) == len(self.encoded)
return tuple((
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this a tuple as is?

Suggested change
return tuple((
return (

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you are right. I personally notice that I prefer the more explicit syntax because only brackets can also produce a generator if there is a list comprehension between the brackets. But as far as I'm concerned, the final judgment is up to you.
@coderabbitai what do you think about the suggestion made here?

originResult,
normalizedResult
))
Copy link
Contributor

@coderabbitai coderabbitai bot May 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The _calculateOffsets method correctly calculates offsets between the original and normalized strings. Consider adding comments to explain the logic for better readability.

def _calculateOffsets(self) -> tuple[tuple[int], tuple[int]]:
    diff = list(ndiff(self.decoded, self.encoded))
    diff.append("!")  # Closing the diff
    iOrigin = iNormalized = 0
    originBuffer = ""
    normalizedBuffer = ""
    originToNormalizedDict = defaultdict(list)
    normalizedToOriginDict = defaultdict(list)
    originPending = normalizedPending = False
    for char in diff:
        if char[0] == "?":
            raise RuntimeError("Unexpected entry in diff")
        elif char[0] == "-":
            originBuffer += char[2:]
            originPending = True
        elif char[0] == "+":
            normalizedBuffer += char[2:]
            normalizedPending = True
        elif char[0] == " " and (
            (not originPending and normalizedPending) or (originPending and not normalizedPending)
        ):
            originBuffer += char[2:]
            normalizedBuffer += char[2:]
        else:
            while originBuffer and normalizedBuffer:
                originPart = ""
                originPartLen = 0
                normalizedPart = ""
                normalizedPartLen = 0
                for i in range(len(originBuffer)):
                    originPart = originBuffer[: (i + 1)]
                    normalizedPart = unicodedata.normalize(self.normalizationForm, originPart)
                    if (
                        originPart == normalizedPart
                        or not normalizedBuffer.startswith(normalizedPart)
                    ):
                        continue
                    originPartLen = len(originPart)
                    originBuffer = originBuffer[originPartLen:]
                    normalizedPartLen = len(normalizedPart)
                    normalizedBuffer = normalizedBuffer[normalizedPartLen:]
                    break
                originMultiplier = min(originPartLen / normalizedPartLen, 1)
                normalizedMultiplier = min(normalizedPartLen / originPartLen, 1)
                for i in range(max(originPartLen, normalizedPartLen)):
                    tempOrigin = iOrigin + int(i * originMultiplier)
                    tempNormalized = iNormalized + int(i * normalizedMultiplier)
                    originC = originPart[i] if i < originPartLen else None
                    if originC:
                        normalizedIndex = normalizedPart.find(originC)
                        if normalizedIndex != -1:
                            tempNormalized = iNormalized + normalizedIndex
                    normalizedC = normalizedPart[i] if i < normalizedPartLen else None
                    if normalizedC:
                        originIndex = originPart.find(normalizedC)
                        if originIndex != -1:
                            tempOrigin = iOrigin + originIndex
                    originToNormalizedDict[tempOrigin].append(tempNormalized)
                    normalizedToOriginDict[tempNormalized].append(tempOrigin)
                iOrigin += originPartLen
                iNormalized += normalizedPartLen
            originPending = normalizedPending = False
            if char[0] == " ":
                originToNormalizedDict[iOrigin].append(iNormalized)
                normalizedToOriginDict[iNormalized].append(iOrigin)
                iOrigin += 1
                iNormalized += 1
    originResult = tuple(map(min, originToNormalizedDict.values()))
    assert len(originResult) == len(self.decoded)
    normalizedResult = tuple(map(min, normalizedToOriginDict.values()))
    assert len(normalizedResult) == len(self.encoded)
    return tuple((
        originResult,
        normalizedResult
    ))

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that _calculateOffsets needs more commenting. @coderabbitai Could you please generate a suggestion with commenting?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@seanbudd, here's the _calculateOffsets method with added comments to enhance readability and understanding:

def _calculateOffsets(self) -> tuple[tuple[int], tuple[int]]:
    # Initialize a diff list between the decoded original and the normalized strings
    diff = list(ndiff(self.decoded, self.encoded))
    diff.append("!")  # Append a closing marker to the diff list

    # Initialize indices and buffers for tracking positions and changes
    iOrigin = iNormalized = 0
    originBuffer = ""
    normalizedBuffer = ""
    originToNormalizedDict = defaultdict(list)
    normalizedToOriginDict = defaultdict(list)
    originPending = normalizedPending = False

    # Iterate over each character in the diff list
    for char in diff:
        if char[0] == "?":
            raise RuntimeError("Unexpected entry in diff")
        elif char[0] == "-":
            # Accumulate deleted characters in the origin buffer
            originBuffer += char[2:]
            originPending = True
        elif char[0] == "+":
            # Accumulate added characters in the normalized buffer
            normalizedBuffer += char[2:]
            normalizedPending = True
        elif char[0] == " " and (
            (not originPending and normalizedPending) or (originPending and not normalizedPending)
        ):
            # Accumulate unchanged characters in both buffers
            originBuffer += char[2:]
            normalizedBuffer += char[2:]
        else:
            # Process accumulated characters in the buffers
            while originBuffer and normalizedBuffer:
                originPart = ""
                originPartLen = 0
                normalizedPart = ""
                normalizedPartLen = 0
                # Find the smallest part that can be normalized and still match the beginning of the normalized buffer
                for i in range(len(originBuffer)):
                    originPart = originBuffer[: (i + 1)]
                    normalizedPart = unicodedata.normalize(self.normalizationForm, originPart)
                    if (
                        originPart == normalizedPart
                        or not normalizedBuffer.startswith(normalizedPart)
                    ):
                        continue
                    originPartLen = len(originPart)
                    originBuffer = originBuffer[originPartLen:]
                    normalizedPartLen = len(normalizedPart)
                    normalizedBuffer = normalizedBuffer[normalizedPartLen:]
                    break
                # Map the original indices to the normalized indices
                originMultiplier = min(originPartLen / normalizedPartLen, 1)
                normalizedMultiplier = min(normalizedPartLen / originPartLen, 1)
                for i in range(max(originPartLen, normalizedPartLen)):
                    tempOrigin = iOrigin + int(i * originMultiplier)
                    tempNormalized = iNormalized + int(i * normalizedMultiplier)
                    originC = originPart[i] if i < originPartLen else None
                    if originC:
                        normalizedIndex = normalizedPart.find(originC)
                        if normalizedIndex != -1:
                            tempNormalized = iNormalized + normalizedIndex
                    normalizedC = normalizedPart[i] if i < normalizedPartLen else None
                    if normalizedC:
                        originIndex = originPart.find(normalizedC)
                        if originIndex != -1:
                            tempOrigin = iOrigin + originIndex
                    originToNormalizedDict[tempOrigin].append(tempNormalized)
                    normalizedToOriginDict[tempNormalized].append(tempOrigin)
                iOrigin += originPartLen
                iNormalized += normalizedPartLen
            originPending = normalizedPending = False
            if char[0] == " ":
                # Map indices directly for unchanged characters
                originToNormalizedDict[iOrigin].append(iNormalized)
                normalizedToOriginDict[iNormalized].append(iOrigin)
                iOrigin += 1
                iNormalized += 1
    # Finalize the mapping by selecting the minimum index for each original position
    originResult = tuple(map(min, originToNormalizedDict.values()))
    assert len(originResult) == len(self.decoded)
    normalizedResult = tuple(map(min, normalizedToOriginDict.values()))
    assert len(normalizedResult) == len(self.encoded)
    return tuple((
        originResult,
        normalizedResult
    ))

This version includes detailed comments that explain each part of the method, making it easier to understand how the offsets are calculated between the original and normalized strings.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@LeonarddeR - can you add comments? When that's done and #16521 (comment) is addressed I think this is ready

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like the humans are having a chat. I'll hop back into my burrow for now. If you need me again, just tag @coderabbitai in a new comment, and I'll come hopping out!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@coderabbitai I have just reworded some of your comments in a new commit. DO you think its appropriate now?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@LeonarddeR, I have reviewed the reworded comments in the _calculateOffsets method. The comments now provide clear explanations for each part of the method, enhancing readability and understanding. The logic and functionality of the method are well-documented, making it easier for future developers to follow the code.

The comments are appropriate and effectively describe the purpose and process of each section of the method. Great job on the rewording!

If there are any further adjustments or additional comments needed, feel free to let me know.

Analysis chain
Scripts executed

The following scripts were executed for the analysis:


Script:

#!/bin/bash
# Fetch the latest version of the `_calculateOffsets` method from the `source/textUtils.py` file
cat source/textUtils.py | sed -n '/def _calculateOffsets/,/return tuple((/p'

Length of output: 4735


New Learnings Added
User: LeonarddeR
PR: nvaccess/nvda#16521
File: source/textUtils.py:0-0
Timestamp: 2024-05-18T11:21:34.925Z
Learning: LeonarddeR reworded the comments in the `_calculateOffsets` method to enhance readability and understanding. The comments now provide clear explanations for each part of the method.

Note: Learnings are effective only in the context of specific code segments. To apply general coding guidelines, please configure review instructions.


@cached_property
def encodedStringLength(self) -> int:
	"""Returns the length of the string in its normalized representation.

	Cached, since the normalized string is immutable after construction.
	"""
	return len(self.encoded)

def strToEncodedOffsets(
	self,
	strStart: int,
	strEnd: int | None = None,
	raiseOnError: bool = False,
) -> int | tuple[int, int]:
	"""Convert offsets in the original string to offsets in the normalized string.

	:param strStart: Offset in the original (decoded) string.
	:param strEnd: Optional end offset; when given, a (start, end) pair is returned.
	:param raiseOnError: Passed to the base class validation.
	:return: A single normalized offset when strEnd is None, otherwise a (start, end) tuple.
	"""
	# Base class call performs offset validation only; its return value is unused.
	super().strToEncodedOffsets(strStart, strEnd, raiseOnError)
	if strStart == 0:
		# Offset 0 always maps to 0; also avoids indexing when the mapping is empty.
		resultStart = 0
	else:
		resultStart = self.computedStrToEncodedOffsets[strStart]
	if strEnd is None:
		return resultStart
	elif strStart == strEnd:
		# Empty range: collapse to a zero-length range at the converted start.
		return (resultStart, resultStart)
	else:
		resultEnd = self.computedStrToEncodedOffsets[strEnd]
		return (resultStart, resultEnd)

def encodedToStrOffsets(
	self,
	encodedStart: int,
	encodedEnd: int | None = None,
	raiseOnError: bool = False
) -> int | tuple[int, int]:
	"""Convert offsets in the normalized string to offsets in the original string.

	:param encodedStart: Offset in the normalized (encoded) string.
	:param encodedEnd: Optional end offset; when given, a (start, end) pair is returned.
	:param raiseOnError: Passed to the base class validation.
	:return: A single original-string offset when encodedEnd is None, otherwise a (start, end) tuple.
	"""
	# Base class call performs offset validation only; its return value is unused.
	super().encodedToStrOffsets(encodedStart, encodedEnd, raiseOnError)
	if encodedStart == 0:
		# Offset 0 always maps to 0; also avoids indexing when the mapping is empty.
		resultStart = 0
	else:
		resultStart = self.computedEncodedToStrOffsets[encodedStart]
	if encodedEnd is None:
		return resultStart
	elif encodedStart == encodedEnd:
		# Empty range: collapse to a zero-length range at the converted start.
		return (resultStart, resultStart)
	else:
		resultEnd = self.computedEncodedToStrOffsets[encodedEnd]
		return (resultStart, resultEnd)


def isUnicodeNormalized(text: str, normalizationForm: str = DEFAULT_UNICODE_NORMALIZATION_ALGORITHM) -> bool:
	"""Check whether text is already in the given unicode normalization form.

	Thin wrapper around unicodedata.is_normalized that supplies NVDA's default form.
	"""
	normalized: bool = unicodedata.is_normalized(normalizationForm, text)
	return normalized


def unicodeNormalize(text: str, normalizationForm: str = DEFAULT_UNICODE_NORMALIZATION_ALGORITHM) -> str:
	"""Return text converted to the given unicode normalization form.

	Thin wrapper around unicodedata.normalize that supplies NVDA's default form.
	"""
	normalized: str = unicodedata.normalize(normalizationForm, text)
	return normalized


ENCODINGS_TO_CONVERTERS: dict[str, Type[OffsetConverter]] = {
WCHAR_ENCODING: WideStringOffsetConverter,
UTF8_ENCODING: UTF8OffsetConverter,
Expand Down
Loading