diff --git a/src/tokenizer/abbrev.py b/src/tokenizer/abbrev.py
index a57c954..010cc29 100644
--- a/src/tokenizer/abbrev.py
+++ b/src/tokenizer/abbrev.py
@@ -33,7 +33,7 @@
 
 """
 
-from typing import Generic, Iterator, Optional, Set, List, Dict, TypeVar
+from typing import Generic, Iterator, Optional, TypeVar
 from threading import Lock
 
 from collections import defaultdict, OrderedDict
@@ -56,7 +56,7 @@ class OrderedSet(Generic[_T]):
     if a standard Python set() was used."""
 
     def __init__(self) -> None:
-        self._dict: Dict[_T, None] = OrderedDict()
+        self._dict: dict[_T, None] = OrderedDict()
 
     def add(self, item: _T) -> None:
         """Add an item at the end of the ordered set"""
@@ -75,29 +75,29 @@ class Abbreviations:
     initialized from the config file"""
 
     # Dictionary of abbreviations and their meanings
-    DICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
+    DICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
     # Wrong versions of abbreviations
-    WRONGDICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
+    WRONGDICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
     # All abbreviation meanings
-    MEANINGS: Set[str] = set()
+    MEANINGS: set[str] = set()
     # Single-word abbreviations, i.e. those with only one dot at the end
-    SINGLES: Set[str] = set()
+    SINGLES: set[str] = set()
     # Set of abbreviations without periods, e.g. "td", "osfrv"
-    WRONGSINGLES: Set[str] = set()
+    WRONGSINGLES: set[str] = set()
     # Potential sentence finishers, i.e. those with a dot at the end,
     # marked with an asterisk in the config file
-    FINISHERS: Set[str] = set()
+    FINISHERS: set[str] = set()
     # Abbreviations that should not be seen as such at the end of sentences,
     # marked with an exclamation mark in the config file
-    NOT_FINISHERS: Set[str] = set()
+    NOT_FINISHERS: set[str] = set()
     # Abbreviations that should not be seen as such at the end of sentences, but
     # are allowed in front of person names; marked with a hat ^ in the config file
-    NAME_FINISHERS: Set[str] = set()
+    NAME_FINISHERS: set[str] = set()
     # Wrong versions of abbreviations with possible corrections
     # wrong version : [correction1, correction2, ...]
-    WRONGDOTS: Dict[str, List[str]] = defaultdict(list)
+    WRONGDOTS: dict[str, list[str]] = defaultdict(list)
     # Word forms that should never be interpreted as abbreviations
-    NOT_ABBREVIATIONS: Set[str] = set()
+    NOT_ABBREVIATIONS: set[str] = set()
 
     # Ensure that only one thread initializes the abbreviations
     _lock = Lock()
@@ -208,7 +208,7 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
         i1 = indices[0]
         i2 = indices[1]
         i3 = indices[2]
-        wabbrevs: List[str] = []
+        wabbrevs: list[str] = []
         # 1 and 2 removed
         wabbrevs.append(abbrev[:i1] + abbrev[i1 + 1 : i2] + abbrev[i2 + 1 :])
         # 1 and 3 removed
@@ -257,7 +257,7 @@ def has_abbreviation(meaning: str) -> bool:
         return meaning in Abbreviations.MEANINGS
 
     @staticmethod
-    def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]:
+    def get_meaning(abbrev: str) -> Optional[list[BIN_Tuple]]:
         """Look up meaning(s) of abbreviation, if available."""
         m = Abbreviations.DICT.get(abbrev)
         if not m:
diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py
index 812aeb7..b327939 100644
--- a/src/tokenizer/definitions.py
+++ b/src/tokenizer/definitions.py
@@ -29,13 +29,10 @@
 """
 
 from typing import (
-    Dict,
-    FrozenSet,
     Mapping,
     Tuple,
     Union,
     Callable,
-    List,
     Sequence,
     Optional,
     NamedTuple,
@@ -47,13 +44,13 @@
 
 BeginTuple = Tuple[int, Optional[int]]
 PunctuationTuple = Tuple[int, str]
-NumberTuple = Tuple[float, Optional[List[str]], Optional[List[str]]]
+NumberTuple = Tuple[float, Optional[list[str]], Optional[list[str]]]
 DateTimeTuple = Tuple[int, int, int]
 MeasurementTuple = Tuple[str, float]
 TimeStampTuple = Tuple[int, int, int, int, int, int]
-AmountTuple = Tuple[float, str, Optional[List[str]], Optional[List[str]]]
+AmountTuple = Tuple[float, str, Optional[list[str]], Optional[list[str]]]
 TelnoTuple = Tuple[str, str]
-CurrencyTuple = Tuple[str, Optional[List[str]], Optional[List[str]]]
+CurrencyTuple = Tuple[str, Optional[list[str]], Optional[list[str]]]
 
 
 class BIN_Tuple(NamedTuple):
@@ -434,7 +431,7 @@ class PersonNameTuple(NamedTuple):
 SINGLECHAR_FRACTIONS = "↉⅒⅑⅛⅐⅙⅕¼⅓½⅖⅔⅜⅗¾⅘⅝⅚⅞"
 
 # Derived unit : (base SI unit, conversion factor/function)
-SI_UNITS: Dict[str, Tuple[str, Union[float, Callable[[float], float]]]] = {
+SI_UNITS: dict[str, Tuple[str, Union[float, Callable[[float], float]]]] = {
     # Distance
     "m": ("m", 1.0),
     "mm": ("m", 1.0e-3),
@@ -538,7 +535,7 @@ class PersonNameTuple(NamedTuple):
         unit + r"(?!\w)" if unit[-1].isalpha() else unit
     )
 
-SI_UNITS_SET: FrozenSet[str] = frozenset(SI_UNITS.keys())
+SI_UNITS_SET: frozenset[str] = frozenset(SI_UNITS.keys())
 SI_UNITS_REGEX_STRING = r"|".join(
     map(
         # If the unit ends with a letter, don't allow the next character
diff --git a/src/tokenizer/main.py b/src/tokenizer/main.py
index b6a94eb..aec191d 100755
--- a/src/tokenizer/main.py
+++ b/src/tokenizer/main.py
@@ -35,7 +35,7 @@
 
 """
 
-from typing import TextIO, Dict, Iterator, List, Callable, Any, Tuple, Union, cast
+from typing import TextIO, Iterator, Callable, Any, Tuple, Union, cast
 
 import sys
 import argparse
@@ -158,14 +158,14 @@ def main() -> None:
     """Main function, called when the tokenize command is invoked"""
 
     args = parser.parse_args()
-    options: Dict[str, bool] = dict()
+    options: dict[str, bool] = dict()
 
     def quote(s: str) -> str:
         """Return the string s within double quotes, and with any contained
         backslashes and double quotes escaped with a backslash"""
         return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
 
-    def spanquote(l: List[int]) -> str:
+    def spanquote(l: list[int]) -> str:
         """Return the list l
         as a string within double quotes"""
         return '"' + "-".join(str(x) for x in l) + '"'
@@ -180,7 +180,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:
             return None
         if t.kind == TOK.WORD:
             # Get the full expansion of an abbreviation
-            mm = cast(List[BIN_Tuple], t.val)
+            mm = cast(list[BIN_Tuple], t.val)
             if quote_word:
                 # Return a |-delimited list of possible meanings,
                 # joined into a single string
@@ -254,7 +254,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:
     # Configure our JSON dump function
     json_dumps = partial(json.dumps, ensure_ascii=False, separators=(",", ":"))
 
-    curr_sent: List[str] = []
+    curr_sent: list[str] = []
     tsep = "" if args.original else " "  # token separator
     for t in tokenize(gen(args.infile), **options):
         if args.csv:
@@ -275,7 +275,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:
                 print('0,"","","",""', file=args.outfile)
         elif args.json:
             # Output the tokens in JSON format, one line per token
-            d: Dict[str, Union[str, List[int]]] = dict(k=TOK.descr[t.kind])
+            d: dict[str, Union[str, list[int]]] = dict(k=TOK.descr[t.kind])
             if t.txt is not None:
                 d["t"] = t.txt
             v = val(t)
diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py
index 1a6dfc8..35bee09 100644
--- a/src/tokenizer/tokenizer.py
+++ b/src/tokenizer/tokenizer.py
@@ -42,10 +42,8 @@
     Any,
     Callable,
     Deque,
-    FrozenSet,
     Iterable,
     Iterator,
-    List,
     Mapping,
     Match,
     Optional,
@@ -77,7 +75,6 @@
 
 
 class Tok:
-
     """Information about a single token"""
 
     def __init__(
@@ -86,7 +83,7 @@ def __init__(
         txt: Optional[str],
         val: ValType,
         original: Optional[str] = None,
-        origin_spans: Optional[List[int]] = None,
+        origin_spans: Optional[list[int]] = None,
     ) -> None:
         # Type of token
         self.kind: int = kind
@@ -101,7 +98,7 @@ def __init__(
         # Each such integer index maps the corresponding character
         # (which may have substitutions) to its index in 'original'.
         # This is required to preserve 'original' correctly when splitting.
-        self.origin_spans: Optional[List[int]] = origin_spans
+        self.origin_spans: Optional[list[int]] = origin_spans
 
     @classmethod
     def from_txt(cls: Type[_T], txt: str) -> _T:
@@ -312,7 +309,7 @@ def concatenate(
 
         self_origin_spans = self.origin_spans or []
         other_origin_spans = other.origin_spans or []
-        separator_origin_spans: List[int] = (
+        separator_origin_spans: list[int] = (
             [len(self_original)] * len(separator) if len(other_origin_spans) > 0 else []
         )
         new_origin_spans = (
@@ -373,7 +370,6 @@ def quoted_string_repr(obj: Any) -> str:
 
 
 class TOK:
-
     """
     The TOK class contains constants that define token types and
     constructors for creating token instances.
@@ -647,8 +643,8 @@ def Email(t: Union[Tok, str]) -> Tok:
     def Number(
         t: Union[Tok, str],
         n: float,
-        cases: Optional[List[str]] = None,
-        genders: Optional[List[str]] = None,
+        cases: Optional[list[str]] = None,
+        genders: Optional[list[str]] = None,
     ) -> Tok:
         # The cases parameter is a list of possible cases for this number
         # (if it was originally stated in words)
@@ -670,8 +666,8 @@ def NumberWithLetter(t: Union[Tok, str], n: int, c: str) -> Tok:
     def Currency(
         t: Union[Tok, str],
         iso: str,
-        cases: Optional[List[str]] = None,
-        genders: Optional[List[str]] = None,
+        cases: Optional[list[str]] = None,
+        genders: Optional[list[str]] = None,
     ) -> Tok:
         # The cases parameter is a list of possible cases for this currency name
         # (if it was originally stated in words, i.e. not abbreviated)
@@ -686,8 +682,8 @@ def Amount(
         t: Union[Tok, str],
         iso: str,
         n: float,
-        cases: Optional[List[str]] = None,
-        genders: Optional[List[str]] = None,
+        cases: Optional[list[str]] = None,
+        genders: Optional[list[str]] = None,
     ) -> Tok:
         # The cases parameter is a list of possible cases for this amount
         # (if it was originally stated in words)
@@ -701,8 +697,8 @@ def Amount(
     def Percent(
         t: Union[Tok, str],
         n: float,
-        cases: Optional[List[str]] = None,
-        genders: Optional[List[str]] = None,
+        cases: Optional[list[str]] = None,
+        genders: Optional[list[str]] = None,
    ) -> Tok:
         if isinstance(t, str):
             return Tok(TOK.PERCENT, t, (n, cases, genders))
@@ -1559,7 +1555,7 @@ def generate_raw_tokens(
 
 def could_be_end_of_sentence(
     next_token: Tok,
-    test_set: FrozenSet[int] = TOK.TEXT,
+    test_set: frozenset[int] = TOK.TEXT,
     multiplier: bool = False,
 ) -> bool:
     """Return True if next_token could be ending the current sentence or
@@ -1578,7 +1574,6 @@ def could_be_end_of_sentence(
 
 
 class LetterParser:
-
     """Parses a sequence of alphabetic characters
     off the front of a raw token"""
 
@@ -1663,7 +1658,6 @@ def parse(self) -> Iterable[Tok]:
 
 
 class NumberParser:
-
     """Parses a sequence of digits off the front of a raw token"""
 
     def __init__(
@@ -1724,7 +1718,6 @@ def parse(self) -> Iterable[Tok]:
 
 
 class PunctuationParser:
-
     """Parses a sequence of punctuation off the front of a raw token"""
 
     def __init__(self) -> None:
@@ -2108,7 +2101,7 @@ def is_abbr_with_period(txt: str) -> bool:
             return txt not in Abbreviations.DICT
         return False
 
-    def lookup(abbrev: str) -> Optional[List[BIN_Tuple]]:
+    def lookup(abbrev: str) -> Optional[list[BIN_Tuple]]:
         """Look up an abbreviation, both in original case and in lower case,
         and return either None if not found or a meaning list having one entry"""
         m = Abbreviations.DICT.get(abbrev)
@@ -2647,7 +2640,7 @@ def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]:
                 if abbrev in Abbreviations.FINISHERS:
                     token = TOK.Word(
                         token.concatenate(next_token),
-                        cast(Optional[List[BIN_Tuple]], token.val),
+                        cast(Optional[list[BIN_Tuple]], token.val),
                     )
                 next_token = next(token_stream)
 
@@ -2975,7 +2968,7 @@ def parse_phrases_2(
             # Check for composites:
             # 'stjórnskipunar- og eftirlitsnefnd'
             # 'dómsmála-, viðskipta- og iðnaðarráðherra'
-            tq: List[Tok] = []
+            tq: list[Tok] = []
             while token.kind == TOK.WORD and next_token.punctuation == COMPOSITE_HYPHEN:
                 # Accumulate the prefix in tq
                 tq.append(token)
@@ -3081,7 +3074,7 @@ def split_into_sentences(
         to_text = lambda t: t.original or t.txt
     else:
         to_text = lambda t: t.txt
-    curr_sent: List[str] = []
+    curr_sent: list[str] = []
     for t in tokenize_without_annotation(text_or_gen, **options):
         if t.kind in TOK.END:
             # End of sentence/paragraph
@@ -3111,14 +3104,14 @@ def mark_paragraphs(txt: str) -> str:
     return "[[" + "]][[".join(t for t in txt.split("\n") if t) + "]]"
 
 
-def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[Tuple[int, List[Tok]]]]:
+def paragraphs(tokens: Iterable[Tok]) -> Iterator[list[Tuple[int, list[Tok]]]]:
     """Generator yielding paragraphs from token iterable. Each paragraph is a
     list of sentence tuples. Sentence tuples consist of the index of the first
     token of the sentence (the TOK.S_BEGIN token) and a list of the tokens
     within the sentence, not including the starting TOK.S_BEGIN or the
     terminating TOK.S_END tokens."""
 
-    def valid_sent(sent: Optional[List[Tok]]) -> bool:
+    def valid_sent(sent: Optional[list[Tok]]) -> bool:
         """Return True if the token list in sent is a proper sentence
         that we want to process further"""
         if not sent:
@@ -3126,9 +3119,9 @@ def valid_sent(sent: Optional[list[Tok]]) -> bool:
         # A sentence with only punctuation is not valid
         return any(t[0] != TOK.PUNCTUATION for t in sent)
 
-    sent: List[Tok] = []  # Current sentence
+    sent: list[Tok] = []  # Current sentence
     sent_begin = 0
-    current_p: List[Tuple[int, List[Tok]]] = []  # Current paragraph
+    current_p: list[Tuple[int, list[Tok]]] = []  # Current paragraph
 
     for ix, t in enumerate(tokens):
         t0 = t[0]
@@ -3184,7 +3177,7 @@ def correct_spaces(s: str) -> str:
     with correct spacing between tokens. NOTE that this function uses a
     quick-and-dirty approach which may not handle all edge cases!"""
 
-    r: List[str] = []
+    r: list[str] = []
     last = TP_NONE
     double_quote_count = 0
     for w in RE_SPLIT.split(s):
@@ -3244,7 +3237,7 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str:
     to a correctly spaced string. If normalize is True, punctuation is
     normalized before assembling the string."""
     to_text: Callable[[Tok], str] = normalized_text if normalize else lambda t: t.txt
-    r: List[str] = []
+    r: list[str] = []
     last = TP_NONE
     double_quote_count = 0
     for t in tokens:
@@ -3278,7 +3271,7 @@ def detokenize(tokens: Iterable[Tok], normalize: bool = False) -> str:
 
 def calculate_indexes(
     tokens: Iterable[Tok], last_is_end: bool = False
-) -> Tuple[List[int], List[int]]:
+) -> Tuple[list[int], list[int]]:
     """Calculate character and byte indexes for a token stream.
     The indexes are the start positions of each token in the original
     text that was tokenized.
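
Note on the pattern applied throughout this change set: the capitalized generic aliases from the typing module (Dict, List, Set, FrozenSet) are replaced with the built-in generics standardized by PEP 585, which are accepted in annotations on Python 3.9 and later, while Tuple, Union, Optional and Callable imports are left as they were. The sketch below is illustrative only and is not part of the diff; the names Entry, lookup_table and get_meanings are hypothetical stand-ins, not identifiers from the tokenizer package.

# Illustrative sketch, assuming Python 3.9+ (PEP 585 built-in generics).
# Entry, lookup_table and get_meanings are made-up names for demonstration.
from typing import NamedTuple, Optional


class Entry(NamedTuple):
    # Hypothetical stand-in for a meaning record such as BIN_Tuple
    word: str
    meaning: str


# Built-in containers used directly as generic types, so the
# typing.Dict / List / Set / FrozenSet imports become unnecessary
lookup_table: dict[str, list[Entry]] = {}
seen: set[str] = set()
frozen: frozenset[str] = frozenset()


def get_meanings(key: str) -> Optional[list[Entry]]:
    """Return the recorded meanings for key, or None if there are none."""
    entries = lookup_table.get(key)
    return entries or None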