Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Unicode Normalization to speech and braille #16521

Merged
Merged
32 changes: 28 additions & 4 deletions source/braille.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
import brailleViewer
from autoSettingsUtils.driverSetting import BooleanDriverSetting, NumericDriverSetting
from utils.security import objectBelowLockScreenAndWindowsIsLocked
from textUtils import isUnicodeNormalized, UnicodeNormalizationOffsetConverter
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
import hwIo
from editableText import EditableText

Expand Down Expand Up @@ -496,13 +497,36 @@ def update(self):
mode = louis.dotsIO
if config.conf["braille"]["expandAtCursor"] and self.cursorPos is not None:
mode |= louis.compbrlAtCursor
self.brailleCells, self.brailleToRawPos, self.rawToBraillePos, self.brailleCursorPos = louisHelper.translate(
converter: UnicodeNormalizationOffsetConverter | None = None
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
if config.conf["braille"]["unicodeNormalization"] and not isUnicodeNormalized(self.rawText):
converter = UnicodeNormalizationOffsetConverter(self.rawText)
textToTranslate = converter.encoded
# Typeforms must be adapted to represent normalized characters.
textToTranslateTypeforms = [
self.rawTextTypeforms[strOffset] for strOffset in converter.computedEncodedToStrOffsets
]
# Convert the cursor position to a normalized offset.
cursorPos = converter.strToEncodedOffsets(self.cursorPos)
else:
textToTranslate = self.rawText
textToTranslateTypeforms = self.rawTextTypeforms
cursorPos = self.cursorPos
self.brailleCells, brailleToRawPos, rawToBraillePos, self.brailleCursorPos = louisHelper.translate(
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
[handler.table.fileName, "braille-patterns.cti"],
self.rawText,
typeform=self.rawTextTypeforms,
textToTranslate,
typeform=textToTranslateTypeforms,
mode=mode,
cursorPos=self.cursorPos
cursorPos=cursorPos
)
if converter:
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
# The received brailleToRawPos contains braille to normalized positions.
# Process them to represent real raw positions by converting them from normalized ones.
brailleToRawPos = [converter.encodedToStrOffsets(i) for i in brailleToRawPos]
# The received rawToBraillePos contains normalized to braille positions.
# Create a new list based on real raw positions.
rawToBraillePos = [rawToBraillePos[i] for i in converter.computedStrToEncodedOffsets]
self.brailleToRawPos = brailleToRawPos
self.rawToBraillePos = rawToBraillePos
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
if (
self.selectionStart is not None
and self.selectionEnd is not None
Expand Down
2 changes: 2 additions & 0 deletions source/config/configSpec.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
# symbolLevel: One of the characterProcessing.SymbolLevel values.
symbolLevel = integer(default=100)
trustVoiceLanguage = boolean(default=true)
unicodeNormalization = featureFlag(optionsEnum="BoolFlag", behaviorOfDefault="disabled")
includeCLDR = boolean(default=True)
beepSpeechModePitch = integer(default=10000,min=50,max=11025)
outputDevice = string(default=default)
Expand Down Expand Up @@ -82,6 +83,7 @@
optionsEnum="ReviewRoutingMovesSystemCaretFlag", behaviorOfDefault="NEVER")
readByParagraph = boolean(default=false)
wordWrap = boolean(default=true)
unicodeNormalization = featureFlag(optionsEnum="BoolFlag", behaviorOfDefault="disabled")
focusContextPresentation = option("changedContext", "fill", "scroll", default="changedContext")
interruptSpeechWhileScrolling = featureFlag(optionsEnum="BoolFlag", behaviorOfDefault="enabled")
showSelection = featureFlag(optionsEnum="BoolFlag", behaviorOfDefault="enabled")
Expand Down
24 changes: 24 additions & 0 deletions source/gui/settingsDialogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1589,6 +1589,17 @@ def makeSettings(self, settingsSizer):
self.bindHelpEvent("SpeechSettingsTrust", self.trustVoiceLanguageCheckbox)
self.trustVoiceLanguageCheckbox.SetValue(config.conf["speech"]["trustVoiceLanguage"])

self.unicodeNormalizationCombo: nvdaControls.FeatureFlagCombo = settingsSizerHelper.addLabeledControl(
labelText=_(
# Translators: This is a label for a combo-box in the Braille settings panel.
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
"Unicode normali&zation"
),
wxCtrlClass=nvdaControls.FeatureFlagCombo,
keyPath=["speech", "unicodeNormalization"],
conf=config.conf,
)
self.bindHelpEvent("SpeechUnicodeNormalization", self.unicodeNormalizationCombo)

seanbudd marked this conversation as resolved.
Show resolved Hide resolved
includeCLDRText = _(
# Translators: This is the label for a checkbox in the
# voice settings panel (if checked, data from the unicode CLDR will be used
Expand Down Expand Up @@ -1701,6 +1712,7 @@ def onSave(self):
self.symbolLevelList.GetSelection()
].value
config.conf["speech"]["trustVoiceLanguage"] = self.trustVoiceLanguageCheckbox.IsChecked()
self.unicodeNormalizationCombo.saveCurrentValueToConf()
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
currentIncludeCLDR = config.conf["speech"]["includeCLDR"]
config.conf["speech"]["includeCLDR"] = newIncludeCldr = self.includeCLDRCheckbox.IsChecked()
if currentIncludeCLDR is not newIncludeCldr:
Expand Down Expand Up @@ -4145,6 +4157,17 @@ def makeSettings(self, settingsSizer):
self.bindHelpEvent("BrailleSettingsWordWrap", self.wordWrapCheckBox)
self.wordWrapCheckBox.Value = config.conf["braille"]["wordWrap"]

self.unicodeNormalizationCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl(
labelText=_(
# Translators: This is a label for a combo-box in the Braille settings panel.
"Unicode normali&zation"
),
wxCtrlClass=nvdaControls.FeatureFlagCombo,
keyPath=["braille", "unicodeNormalization"],
conf=config.conf,
)
self.bindHelpEvent("BrailleUnicodeNormalization", self.unicodeNormalizationCombo)

seanbudd marked this conversation as resolved.
Show resolved Hide resolved
self.brailleInterruptSpeechCombo: nvdaControls.FeatureFlagCombo = sHelper.addLabeledControl(
labelText=_(
# Translators: This is a label for a combo-box in the Braille settings panel.
Expand Down Expand Up @@ -4184,6 +4207,7 @@ def onSave(self):
self.brailleReviewRoutingMovesSystemCaretCombo.saveCurrentValueToConf()
config.conf["braille"]["readByParagraph"] = self.readByParagraphCheckBox.Value
config.conf["braille"]["wordWrap"] = self.wordWrapCheckBox.Value
self.unicodeNormalizationCombo.saveCurrentValueToConf()
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
config.conf["braille"]["focusContextPresentation"] = self.focusContextPresentationValues[self.focusContextPresentationList.GetSelection()]
self.brailleInterruptSpeechCombo.saveCurrentValueToConf()
self.brailleShowSelectionCombo.saveCurrentValueToConf()
Expand Down
11 changes: 9 additions & 2 deletions source/speech/speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import speechDictHandler
import characterProcessing
import languageHandler
from textUtils import unicodeNormalize
from . import manager
from .extensions import speechCanceled, pre_speechCanceled, pre_speech
from .extensions import filter_speechSequence, speechCanceled
seanbudd marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -1568,6 +1569,8 @@ def getTextInfoSpeech( # noqa: C901
# There was content after the indentation, so there is no more indentation.
indentationDone=True
if command:
if config.conf["speech"]["unicodeNormalization"]:
command = unicodeNormalize(command)
if inTextChunk:
relativeSpeechSequence[-1]+=command
else:
Expand Down Expand Up @@ -1775,7 +1778,7 @@ def getPropertiesSpeech( # noqa: C901
reason: OutputReason = OutputReason.QUERY,
**propertyValues
) -> SpeechSequence:
textList: List[str] = []
textList: SpeechSequence = []
name: Optional[str] = propertyValues.get('name')
if name:
textList.append(name)
Expand Down Expand Up @@ -1968,7 +1971,11 @@ def getPropertiesSpeech( # noqa: C901
errorMessage: str | None = propertyValues.get("errorMessage", None)
if errorMessage:
textList.append(errorMessage)

if config.conf["speech"]["unicodeNormalization"]:
textList = [
unicodeNormalize(t) if isinstance(t, str) else t
for t in textList
]
types.logBadSequenceTypes(textList)
return textList

Expand Down
175 changes: 163 additions & 12 deletions source/textUtils.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,32 @@
# -*- coding: UTF-8 -*-
# A part of NonVisual Desktop Access (NVDA)
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.
# Copyright (C) 2018-2021 NV Access Limited, Babbage B.V., Łukasz Golonka
# Copyright (C) 2018-2024 NV Access Limited, Babbage B.V., Łukasz Golonka

"""
Classes and utilities to deal with offsets in variable width encodings, particularly utf_16.
"""

import encodings
import sys
import ctypes
from collections.abc import ByteString
from typing import Tuple, Optional, Type
import encodings
import locale
import unicodedata
from abc import ABCMeta, abstractmethod, abstractproperty
from collections import defaultdict
from difflib import ndiff
from functools import cached_property
from typing import Optional, Tuple, Type

from logHandler import log
from abc import abstractmethod

WCHAR_ENCODING = "utf_16_le"
UTF8_ENCODING = "utf-8"
USER_ANSI_CODE_PAGE = locale.getpreferredencoding()


class OffsetConverter:
class OffsetConverter(metaclass=ABCMeta):
decoded: str

def __init__(self, text: str):
if not isinstance(text, str):
raise TypeError("Value must be of type str")
Expand All @@ -33,7 +35,7 @@ def __init__(self, text: str):
def __repr__(self):
return f"{self.__class__.__name__}({repr(self.decoded)})"

@property
@abstractproperty
def encodedStringLength(self) -> int:
"""Returns the length of the string in its subclass-specific encoded representation."""
raise NotImplementedError
Expand Down Expand Up @@ -385,8 +387,6 @@ class IdentityOffsetConverter(OffsetConverter):
This is a dummy converter that assumes 1:1 correspondence between encoded and decoded characters.
"""

_encoding: str = UTF8_ENCODING

def __init__(self, text: str):
super().__init__(text)

Expand Down Expand Up @@ -417,6 +417,157 @@ def encodedToStrOffsets(
return (encodedStart, encodedEnd)


DEFAULT_UNICODE_NORMALIZATION_ALGORITHM = "NFKC"


class UnicodeNormalizationOffsetConverter(OffsetConverter):
	"""
	Object that holds a string in both its decoded and its unicode normalized form.
	The object allows for easy conversion between offsets in strings which may or may not be normalized.

	For example, when using the NFKC algorithm, the "ij" ligature normalizes to "ij",
	which takes two characters instead of one.
	"""
	# The unicode normalization form used (e.g. "NFKC"), as passed to unicodedata.normalize.
	normalizationForm: str
	# Maps each offset in the original (decoded) string to an offset in the normalized (encoded) string.
	computedStrToEncodedOffsets: tuple[int, ...]
	# Maps each offset in the normalized (encoded) string back to an offset in the original string.
	computedEncodedToStrOffsets: tuple[int, ...]

	def __init__(self, text: str, normalizationForm: str = DEFAULT_UNICODE_NORMALIZATION_ALGORITHM):
		# Base class validates that text is a str and stores it as self.decoded.
		super().__init__(text)
		self.normalizationForm = normalizationForm
		# The normalized representation of the input text.
		self.encoded: str = unicodedata.normalize(normalizationForm, text)
		# Precompute both offset mappings once, since conversions may be requested repeatedly.
		self.computedStrToEncodedOffsets, self.computedEncodedToStrOffsets = self._calculateOffsets()

def _calculateOffsets(self) -> tuple[tuple[int], tuple[int]]:
diff = list(ndiff(self.decoded, self.encoded))
diff.append("!") # Closing the diff
iOrigin = iNormalized = 0
originBuffer = ""
normalizedBuffer = ""
originToNormalizedDict = defaultdict(list)
normalizedToOriginDict = defaultdict(list)
originPending = normalizedPending = False
for char in diff:
if char[0] == "?":
raise RuntimeError("Unexpected entry in diff")
elif char[0] == "-":
originBuffer += char[2:]
originPending = True
elif char[0] == "+":
normalizedBuffer += char[2:]
normalizedPending = True
elif char[0] == " " and (
(not originPending and normalizedPending) or (originPending and not normalizedPending)
):
originBuffer += char[2:]
normalizedBuffer += char[2:]
else:
while originBuffer and normalizedBuffer:
originPart = ""
originPartLen = 0
normalizedPart = ""
normalizedPartLen = 0
for i in range(len(originBuffer)):
originPart = originBuffer[: (i + 1)]
normalizedPart = unicodedata.normalize(self.normalizationForm, originPart)
if (
originPart == normalizedPart
or not normalizedBuffer.startswith(normalizedPart)
):
continue
originPartLen = len(originPart)
originBuffer = originBuffer[originPartLen:]
normalizedPartLen = len(normalizedPart)
normalizedBuffer = normalizedBuffer[normalizedPartLen:]
break
originMultiplier = min(originPartLen / normalizedPartLen, 1)
normalizedMultiplier = min(normalizedPartLen / originPartLen, 1)
for i in range(max(originPartLen, normalizedPartLen)):
tempOrigin = iOrigin + int(i * originMultiplier)
tempNormalized = iNormalized + int(i * normalizedMultiplier)
originC = originPart[i] if i < originPartLen else None
if originC:
normalizedIndex = normalizedPart.find(originC)
if normalizedIndex != -1:
tempNormalized = iNormalized + normalizedIndex
normalizedC = normalizedPart[i] if i < normalizedPartLen else None
if normalizedC:
originIndex = originPart.find(normalizedC)
if originIndex != -1:
tempOrigin = iOrigin + originIndex
originToNormalizedDict[tempOrigin].append(tempNormalized)
normalizedToOriginDict[tempNormalized].append(tempOrigin)
iOrigin += originPartLen
iNormalized += normalizedPartLen
originPending = normalizedPending = False
if char[0] == " ":
originToNormalizedDict[iOrigin].append(iNormalized)
normalizedToOriginDict[iNormalized].append(iOrigin)
iOrigin += 1
iNormalized += 1
originResult = tuple(map(min, originToNormalizedDict.values()))
assert len(originResult) == len(self.decoded)
normalizedResult = tuple(map(min, normalizedToOriginDict.values()))
assert len(normalizedResult) == len(self.encoded)
return tuple((
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this a tuple as is?

Suggested change
return tuple((
return (

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you are right. I personally notice that I prefer the more explicit syntax because only brackets can also produce a generator if there is a list comprehension between the brackets. But as far as I'm concerned, the final judgment is up to you.
@coderabbitai what do you think about the suggestion made here?

originResult,
normalizedResult
))
Copy link
Contributor

@coderabbitai coderabbitai bot May 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The _calculateOffsets method correctly calculates offsets between the original and normalized strings. Consider adding comments to explain the logic for better readability.

def _calculateOffsets(self) -> tuple[tuple[int], tuple[int]]:
    diff = list(ndiff(self.decoded, self.encoded))
    diff.append("!")  # Closing the diff
    iOrigin = iNormalized = 0
    originBuffer = ""
    normalizedBuffer = ""
    originToNormalizedDict = defaultdict(list)
    normalizedToOriginDict = defaultdict(list)
    originPending = normalizedPending = False
    for char in diff:
        if char[0] == "?":
            raise RuntimeError("Unexpected entry in diff")
        elif char[0] == "-":
            originBuffer += char[2:]
            originPending = True
        elif char[0] == "+":
            normalizedBuffer += char[2:]
            normalizedPending = True
        elif char[0] == " " and (
            (not originPending and normalizedPending) or (originPending and not normalizedPending)
        ):
            originBuffer += char[2:]
            normalizedBuffer += char[2:]
        else:
            while originBuffer and normalizedBuffer:
                originPart = ""
                originPartLen = 0
                normalizedPart = ""
                normalizedPartLen = 0
                for i in range(len(originBuffer)):
                    originPart = originBuffer[: (i + 1)]
                    normalizedPart = unicodedata.normalize(self.normalizationForm, originPart)
                    if (
                        originPart == normalizedPart
                        or not normalizedBuffer.startswith(normalizedPart)
                    ):
                        continue
                    originPartLen = len(originPart)
                    originBuffer = originBuffer[originPartLen:]
                    normalizedPartLen = len(normalizedPart)
                    normalizedBuffer = normalizedBuffer[normalizedPartLen:]
                    break
                originMultiplier = min(originPartLen / normalizedPartLen, 1)
                normalizedMultiplier = min(normalizedPartLen / originPartLen, 1)
                for i in range(max(originPartLen, normalizedPartLen)):
                    tempOrigin = iOrigin + int(i * originMultiplier)
                    tempNormalized = iNormalized + int(i * normalizedMultiplier)
                    originC = originPart[i] if i < originPartLen else None
                    if originC:
                        normalizedIndex = normalizedPart.find(originC)
                        if normalizedIndex != -1:
                            tempNormalized = iNormalized + normalizedIndex
                    normalizedC = normalizedPart[i] if i < normalizedPartLen else None
                    if normalizedC:
                        originIndex = originPart.find(normalizedC)
                        if originIndex != -1:
                            tempOrigin = iOrigin + originIndex
                    originToNormalizedDict[tempOrigin].append(tempNormalized)
                    normalizedToOriginDict[tempNormalized].append(tempOrigin)
                iOrigin += originPartLen
                iNormalized += normalizedPartLen
            originPending = normalizedPending = False
            if char[0] == " ":
                originToNormalizedDict[iOrigin].append(iNormalized)
                normalizedToOriginDict[iNormalized].append(iOrigin)
                iOrigin += 1
                iNormalized += 1
    originResult = tuple(map(min, originToNormalizedDict.values()))
    assert len(originResult) == len(self.decoded)
    normalizedResult = tuple(map(min, normalizedToOriginDict.values()))
    assert len(normalizedResult) == len(self.encoded)
    return tuple((
        originResult,
        normalizedResult
    ))

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that _calculateOffsets needs more commenting. @coderabbitai Could you please generate a suggestion with commenting?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@seanbudd, here's the _calculateOffsets method with added comments to enhance readability and understanding:

def _calculateOffsets(self) -> tuple[tuple[int], tuple[int]]:
    # Initialize a diff list between the decoded original and the normalized strings
    diff = list(ndiff(self.decoded, self.encoded))
    diff.append("!")  # Append a closing marker to the diff list

    # Initialize indices and buffers for tracking positions and changes
    iOrigin = iNormalized = 0
    originBuffer = ""
    normalizedBuffer = ""
    originToNormalizedDict = defaultdict(list)
    normalizedToOriginDict = defaultdict(list)
    originPending = normalizedPending = False

    # Iterate over each character in the diff list
    for char in diff:
        if char[0] == "?":
            raise RuntimeError("Unexpected entry in diff")
        elif char[0] == "-":
            # Accumulate deleted characters in the origin buffer
            originBuffer += char[2:]
            originPending = True
        elif char[0] == "+":
            # Accumulate added characters in the normalized buffer
            normalizedBuffer += char[2:]
            normalizedPending = True
        elif char[0] == " " and (
            (not originPending and normalizedPending) or (originPending and not normalizedPending)
        ):
            # Accumulate unchanged characters in both buffers
            originBuffer += char[2:]
            normalizedBuffer += char[2:]
        else:
            # Process accumulated characters in the buffers
            while originBuffer and normalizedBuffer:
                originPart = ""
                originPartLen = 0
                normalizedPart = ""
                normalizedPartLen = 0
                # Find the smallest part that can be normalized and still match the beginning of the normalized buffer
                for i in range(len(originBuffer)):
                    originPart = originBuffer[: (i + 1)]
                    normalizedPart = unicodedata.normalize(self.normalizationForm, originPart)
                    if (
                        originPart == normalizedPart
                        or not normalizedBuffer.startswith(normalizedPart)
                    ):
                        continue
                    originPartLen = len(originPart)
                    originBuffer = originBuffer[originPartLen:]
                    normalizedPartLen = len(normalizedPart)
                    normalizedBuffer = normalizedBuffer[normalizedPartLen:]
                    break
                # Map the original indices to the normalized indices
                originMultiplier = min(originPartLen / normalizedPartLen, 1)
                normalizedMultiplier = min(normalizedPartLen / originPartLen, 1)
                for i in range(max(originPartLen, normalizedPartLen)):
                    tempOrigin = iOrigin + int(i * originMultiplier)
                    tempNormalized = iNormalized + int(i * normalizedMultiplier)
                    originC = originPart[i] if i < originPartLen else None
                    if originC:
                        normalizedIndex = normalizedPart.find(originC)
                        if normalizedIndex != -1:
                            tempNormalized = iNormalized + normalizedIndex
                    normalizedC = normalizedPart[i] if i < normalizedPartLen else None
                    if normalizedC:
                        originIndex = originPart.find(normalizedC)
                        if originIndex != -1:
                            tempOrigin = iOrigin + originIndex
                    originToNormalizedDict[tempOrigin].append(tempNormalized)
                    normalizedToOriginDict[tempNormalized].append(tempOrigin)
                iOrigin += originPartLen
                iNormalized += normalizedPartLen
            originPending = normalizedPending = False
            if char[0] == " ":
                # Map indices directly for unchanged characters
                originToNormalizedDict[iOrigin].append(iNormalized)
                normalizedToOriginDict[iNormalized].append(iOrigin)
                iOrigin += 1
                iNormalized += 1
    # Finalize the mapping by selecting the minimum index for each original position
    originResult = tuple(map(min, originToNormalizedDict.values()))
    assert len(originResult) == len(self.decoded)
    normalizedResult = tuple(map(min, normalizedToOriginDict.values()))
    assert len(normalizedResult) == len(self.encoded)
    return tuple((
        originResult,
        normalizedResult
    ))

This version includes detailed comments that explain each part of the method, making it easier to understand how the offsets are calculated between the original and normalized strings.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@LeonarddeR - can you add comments? When that's done and #16521 (comment) is addressed I think this is ready

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like the humans are having a chat. I'll hop back into my burrow for now. If you need me again, just tag @coderabbitai in a new comment, and I'll come hopping out!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@coderabbitai I have just reworded some of your comments in a new commit. DO you think its appropriate now?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@LeonarddeR, I have reviewed the reworded comments in the _calculateOffsets method. The comments now provide clear explanations for each part of the method, enhancing readability and understanding. The logic and functionality of the method are well-documented, making it easier for future developers to follow the code.

The comments are appropriate and effectively describe the purpose and process of each section of the method. Great job on the rewording!

If there are any further adjustments or additional comments needed, feel free to let me know.

Analysis chain
Scripts executed

The following scripts were executed for the analysis:


Script:

#!/bin/bash
# Fetch the latest version of the `_calculateOffsets` method from the `source/textUtils.py` file
cat source/textUtils.py | sed -n '/def _calculateOffsets/,/return tuple((/p'

Length of output: 4735


New Learnings Added
User: LeonarddeR
PR: nvaccess/nvda#16521
File: source/textUtils.py:0-0
Timestamp: 2024-05-18T11:21:34.925Z
Learning: LeonarddeR reworded the comments in the `_calculateOffsets` method to enhance readability and understanding. The comments now provide clear explanations for each part of the method.

Note: Learnings are effective only in the context of specific code segments. To apply general coding guidelines, please configure review instructions.


@cached_property
def encodedStringLength(self) -> int:
	"""Returns the length of the string in its normalized representation.

	Cached, since the normalized string is immutable after construction.
	"""
	return len(self.encoded)

def strToEncodedOffsets(
	self,
	strStart: int,
	strEnd: int | None = None,
	raiseOnError: bool = False,
) -> int | tuple[int, int]:
	"""Convert offsets in the original string to offsets in the normalized string.

	:param strStart: Offset in the original (decoded) string.
	:param strEnd: Optional end offset; when given, a (start, end) pair is returned.
	:param raiseOnError: Passed to the base class validation.
	:return: A single normalized offset when strEnd is None, otherwise a (start, end) tuple.
	"""
	# Base class call performs offset validation only; its return value is unused.
	super().strToEncodedOffsets(strStart, strEnd, raiseOnError)
	if strStart == 0:
		# Offset 0 always maps to 0; also avoids indexing when the mapping is empty.
		resultStart = 0
	else:
		resultStart = self.computedStrToEncodedOffsets[strStart]
	if strEnd is None:
		return resultStart
	elif strStart == strEnd:
		# Empty range: collapse to a zero-length range at the converted start.
		return (resultStart, resultStart)
	else:
		resultEnd = self.computedStrToEncodedOffsets[strEnd]
		return (resultStart, resultEnd)

def encodedToStrOffsets(
	self,
	encodedStart: int,
	encodedEnd: int | None = None,
	raiseOnError: bool = False
) -> int | tuple[int, int]:
	"""Convert offsets in the normalized string to offsets in the original string.

	:param encodedStart: Offset in the normalized (encoded) string.
	:param encodedEnd: Optional end offset; when given, a (start, end) pair is returned.
	:param raiseOnError: Passed to the base class validation.
	:return: A single original-string offset when encodedEnd is None, otherwise a (start, end) tuple.
	"""
	# Base class call performs offset validation only; its return value is unused.
	super().encodedToStrOffsets(encodedStart, encodedEnd, raiseOnError)
	if encodedStart == 0:
		# Offset 0 always maps to 0; also avoids indexing when the mapping is empty.
		resultStart = 0
	else:
		resultStart = self.computedEncodedToStrOffsets[encodedStart]
	if encodedEnd is None:
		return resultStart
	elif encodedStart == encodedEnd:
		# Empty range: collapse to a zero-length range at the converted start.
		return (resultStart, resultStart)
	else:
		resultEnd = self.computedEncodedToStrOffsets[encodedEnd]
		return (resultStart, resultEnd)


def isUnicodeNormalized(text: str, normalizationForm: str = DEFAULT_UNICODE_NORMALIZATION_ALGORITHM) -> bool:
	"""Check whether text is already in the given unicode normalization form.

	Thin wrapper around unicodedata.is_normalized that supplies NVDA's default form.
	"""
	normalized: bool = unicodedata.is_normalized(normalizationForm, text)
	return normalized


def unicodeNormalize(text: str, normalizationForm: str = DEFAULT_UNICODE_NORMALIZATION_ALGORITHM) -> str:
	"""Return text converted to the given unicode normalization form.

	Thin wrapper around unicodedata.normalize that supplies NVDA's default form.
	"""
	normalized: str = unicodedata.normalize(normalizationForm, text)
	return normalized


ENCODINGS_TO_CONVERTERS: dict[str, Type[OffsetConverter]] = {
WCHAR_ENCODING: WideStringOffsetConverter,
UTF8_ENCODING: UTF8OffsetConverter,
Expand Down
Loading